Commit 8d6a2a01 authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginnings of mixed/double precision support in OpenCL

parent a3d5f834
......@@ -68,11 +68,18 @@ public:
static const std::string key = "OpenCLPlatformIndex";
return key;
}
/**
* This is the name of the parameter for selecting what numerical precision to use.
*/
static const std::string& OpenCLPrecision() {
static const std::string key = "OpenCLPrecision";
return key;
}
};
class OPENMM_EXPORT OpenCLPlatform::PlatformData {
public:
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty);
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
~PlatformData();
void initializeContexts(const System& system);
void syncContexts();
......
This diff is collapsed.
......@@ -62,7 +62,7 @@ struct mm_float2 {
mm_float2(cl_float x, cl_float y) : x(x), y(y) {
}
};
struct mm_float4 {
struct mm_float4 {
cl_float x, y, z, w;
mm_float4() {
}
......@@ -87,6 +87,20 @@ struct mm_float16 {
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
}
};
struct mm_double2 {
cl_double x, y;
mm_double2() {
}
mm_double2(cl_double x, cl_double y) : x(x), y(y) {
}
};
struct mm_double4 {
cl_double x, y, z, w;
mm_double4() {
}
mm_double4(cl_double x, cl_double y, cl_double z, cl_double w) : x(x), y(y), z(z), w(w) {
}
};
struct mm_ushort2 {
cl_ushort x, y;
mm_ushort2() {
......@@ -145,7 +159,7 @@ public:
class ReorderListener;
static const int ThreadBlockSize;
static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData);
OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
~OpenCLContext();
/**
* This is called to initialize internal data structures after all Forces in the system
......@@ -198,6 +212,12 @@ public:
OpenCLArray& getPosq() {
return *posq;
}
/**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/
OpenCLArray& getPosqCorrection() {
return *posqCorrection;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
......@@ -405,18 +425,38 @@ public:
bool getSupportsDoublePrecision() {
return supportsDoublePrecision;
}
/**
* Get whether double precision is being used.
*/
bool getUseDoublePrecision() {
return useDoublePrecision;
}
/**
* Get whether mixed precision is being used.
*/
bool getUseMixedPrecision() {
return useMixedPrecision;
}
/**
* Get the size of the periodic box.
*/
mm_float4 getPeriodicBoxSize() const {
return periodicBoxSize;
}
/**
* Get the size of the periodic box.
*/
mm_double4 getPeriodicBoxSizeDouble() const {
return periodicBoxSizeDouble;
}
/**
* Set the size of the periodic box.
*/
void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
periodicBoxSizeDouble = mm_double4(xsize, ysize, zsize, 0);
invPeriodicBoxSizeDouble = mm_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0);
}
/**
* Get the inverse of the size of the periodic box.
......@@ -424,6 +464,12 @@ public:
mm_float4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize;
}
/**
* Get the inverse of the size of the periodic box.
*/
mm_double4 getInvPeriodicBoxSizeDouble() const {
return invPeriodicBoxSizeDouble;
}
/**
* Get the OpenCLIntegrationUtilities for this context.
*/
......@@ -502,6 +548,11 @@ private:
* of molecules and resort the atoms.
*/
void validateMolecules();
/**
* This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
*/
template <class Real, class Real4, class Mixed, class Mixed4>
void reorderAtomsImpl(bool enforcePeriodic);
const System& system;
double time;
OpenCLPlatform::PlatformData& platformData;
......@@ -515,9 +566,9 @@ private:
int numThreadBlocks;
int numForceBuffers;
int simdWidth;
bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered, moleculesInvalid;
mm_float4 periodicBoxSize;
mm_float4 invPeriodicBoxSize;
bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, moleculesInvalid;
mm_float4 periodicBoxSize, invPeriodicBoxSize;
mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble;
std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines;
cl::Context context;
......@@ -538,6 +589,7 @@ private:
cl::Buffer* pinnedBuffer;
void* pinnedMemory;
OpenCLArray* posq;
OpenCLArray* posqCorrection;
OpenCLArray* velm;
OpenCLArray* force;
OpenCLArray* forceBuffers;
......
......@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in
}
};
static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
if (cl.getUseMixedPrecision())
kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
else
kernel.setArg<void*>(index, NULL);
}
OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
......@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays.
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
stepSize->upload(step);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = OpenCLArray::create<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_double4> deltas(posDelta->getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_double2>(context, 1, "stepSize");
vector<mm_double2> step(1, mm_double2(0.0, 0.0));
stepSize->upload(step);
}
else {
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
stepSize->upload(step);
}
// Create kernels for enforcing constraints.
......@@ -458,51 +475,86 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Record the CCMA data structures.
ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = (float) distance[c];
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
ccmaDistance = OpenCLArray::create<mm_double4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_double4> distanceVec(ccmaDistance->getSize());
vector<cl_double> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = distance[c];
reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
else {
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = (float) distance[c];
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
}
ccmaAtoms->upload(atomsVec);
ccmaDistance->upload(distanceVec);
ccmaAtomConstraints->upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
// Create the CCMA kernels.
......@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer());
setPosqCorrectionArg(context, vsitePositionKernel, 1);
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
// Skip argument 1: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
setPosqCorrectionArg(context, vsiteForceKernel, 1);
// Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
numVsites = num2Avg+num3Avg+numOutOfPlane;
}
......@@ -686,23 +740,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
if (!hasInitialized) {
settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer());
if (context.getUseMixedPrecision())
settleKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
settleKernel.setArg<void*>(3, NULL);
settleKernel.setArg<cl::Buffer>(4, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
}
settleKernel.setArg<cl_float>(1, (cl_float) tol);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel.setArg<cl_double>(1, (cl_double) tol);
else
settleKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(settleKernel, settleAtoms->getSize());
}
if (shakeAtoms != NULL) {
if (!hasInitialized) {
shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer());
if (context.getUseMixedPrecision())
shakeKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
shakeKernel.setArg<void*>(3, NULL);
shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(6, shakeParams->getDeviceBuffer());
}
shakeKernel.setArg<cl_float>(1, (cl_float) tol);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel.setArg<cl_double>(1, (cl_double) tol);
else
shakeKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(shakeKernel, shakeAtoms->getSize());
}
if (ccmaAtoms != NULL) {
......@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
ccmaDirectionsKernel.setArg<void*>(3, NULL);
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
......@@ -730,7 +802,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
}
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
else
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
const int checkInterval = 4;
cl::Event event;
......@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) {
vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
context.executeKernel(vsiteForceKernel, numVsites);
}
}
......
This diff is collapsed.
......@@ -1145,7 +1145,10 @@ private:
OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed;
OpenCLParameterSet* perDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
std::vector<float> contextValuesFloat;
std::vector<double> contextValuesDouble;
std::vector<float> contextValues;
std::vector<std::vector<cl::Kernel> > kernels;
cl::Kernel sumEnergyKernel, randomKernel;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,32 +32,34 @@
using namespace OpenMM;
using namespace std;
OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) :
OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
int params = numParameters;
int bufferCount = 0;
elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
string elementType = (useDoublePrecision ? "double" : "float");
try {
if (!bufferPerParameter) {
while (params > 2) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4));
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*4);
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, *buf));
params -= 4;
}
if (params > 1) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2));
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*2);
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, *buf));
params -= 2;
}
}
while (params > 0) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float));
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize);
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, *buf));
params--;
}
}
......@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() {
delete &buffers[i].getMemory();
}
void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) const {
template <class T>
void OpenCLParameterSet::getParameterValues(vector<vector<T> >& values) const {
if (sizeof(T) != elementSize)
throw OpenMMException("Called getParameterValues() with vector of wrong type");
values.resize(numObjects);
for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters);
try {
int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") {
vector<mm_float4> data(numObjects);
if (buffers[i].getSize() == 4*elementSize) {
vector<T> data(4*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x;
values[j][base] = data[4*j];
if (base+1 < numParameters)
values[j][base+1] = data[j].y;
values[j][base+1] = data[4*j+1];
if (base+2 < numParameters)
values[j][base+2] = data[j].z;
values[j][base+2] = data[4*j+2];
if (base+3 < numParameters)
values[j][base+3] = data[j].w;
values[j][base+3] = data[4*j+3];
}
base += 4;
}
else if (buffers[i].getType() == "float2") {
vector<mm_float2> data(numObjects);
else if (buffers[i].getSize() == 2*elementSize) {
vector<T> data(2*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x;
values[j][base] = data[2*j];
if (base+1 < numParameters)
values[j][base+1] = data[j].y;
values[j][base+1] = data[2*j+1];
}
base += 2;
}
else if (buffers[i].getType() == "float") {
vector<cl_float> data(numObjects);
else if (buffers[i].getSize() == elementSize) {
vector<T> data(numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++)
values[j][base] = data[j];
......@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c
}
}
void OpenCLParameterSet::setParameterValues(const vector<vector<cl_float> >& values) {
template <class T>
void OpenCLParameterSet::setParameterValues(const vector<vector<T> >& values) {
if (sizeof(T) != elementSize)
throw OpenMMException("Called setParameterValues() with vector of wrong type");
try {
int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") {
vector<mm_float4> data(numObjects);
if (buffers[i].getSize() == 4*elementSize) {
vector<T> data(4*numObjects);
for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base];
data[4*j] = values[j][base];
if (base+1 < numParameters)
data[j].y = values[j][base+1];
data[4*j+1] = values[j][base+1];
if (base+2 < numParameters)
data[j].z = values[j][base+2];
data[4*j+2] = values[j][base+2];
if (base+3 < numParameters)
data[j].w = values[j][base+3];
data[4*j+3] = values[j][base+3];
}
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 4;
}
else if (buffers[i].getType() == "float2") {
vector<mm_float2> data(numObjects);
else if (buffers[i].getSize() == 2*elementSize) {
vector<T> data(2*numObjects);
for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base];
data[2*j] = values[j][base];
if (base+1 < numParameters)
data[j].y = values[j][base+1];
data[2*j+1] = values[j][base+1];
}
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 2;
}
else if (buffers[i].getType() == "float") {
vector<cl_float> data(numObjects);
else if (buffers[i].getSize() == elementSize) {
vector<T> data(numObjects);
for (int j = 0; j < numObjects; j++)
data[j] = values[j][base];
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
......@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr
const string suffixes[] = {".x", ".y", ".z", ".w"};
int buffer = -1;
for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
if (index*sizeof(cl_float) < buffers[i].getSize())
if (index*elementSize < buffers[i].getSize())
buffer = i;
else
index -= buffers[i].getSize()/sizeof(cl_float);
index -= buffers[i].getSize()/elementSize;
}
if (buffer == -1)
throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")");
stringstream suffix;
suffix << (buffer+1) << extraSuffix;
if (buffers[buffer].getType() != "float")
if (buffers[buffer].getSize() != elementSize)
suffix << suffixes[index];
return suffix.str();
}
/**
* Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
*/
namespace OpenMM {
template void OpenCLParameterSet::getParameterValues<float>(vector<vector<float> >& values) const;
template void OpenCLParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
template void OpenCLParameterSet::getParameterValues<double>(vector<vector<double> >& values) const;
template void OpenCLParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
}
\ No newline at end of file
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -51,8 +51,9 @@ public:
* @param name the name of the parameter set
* @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false,
* multiple parameters may be combined into a single buffer.
* @param useDoublePrecision whether values should be stored as single or double precision
*/
OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false);
OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
~OpenCLParameterSet();
/**
* Get the number of parameters.
......@@ -71,13 +72,15 @@ public:
*
* @param values on exit, values[i][j] contains the value of parameter j for object i
*/
void getParameterValues(std::vector<std::vector<cl_float> >& values) const;
template <class T>
void getParameterValues(std::vector<std::vector<T> >& values) const;
/**
* Set the values of all parameters.
*
* @param values values[i][j] contains the value of parameter j for object i
*/
void setParameterValues(const std::vector<std::vector<cl_float> >& values);
template <class T>
void setParameterValues(const std::vector<std::vector<T> >& values);
/**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
......@@ -95,8 +98,7 @@ public:
std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
private:
OpenCLContext& context;
int numParameters;
int numObjects;
int numParameters, numObjects, elementSize;
std::string name;
std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers;
};
......
......@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() {
registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(OpenCLDeviceIndex());
platformProperties.push_back(OpenCLPlatformIndex());
platformProperties.push_back(OpenCLPrecision());
setPropertyDefaultValue(OpenCLDeviceIndex(), "");
setPropertyDefaultValue(OpenCLPlatformIndex(), "");
setPropertyDefaultValue(OpenCLPrecision(), "single");
}
bool OpenCLPlatform::supportsDoublePrecision() const {
......@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second);
const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue));
string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
}
void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
......@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
delete data;
}
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
int platformIndex = 0;
if (platformPropValue.length() > 0)
stringstream(platformPropValue) >> platformIndex;
......@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
if (devices[i].length() > 0) {
unsigned int deviceIndex;
stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, *this));
contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, precisionProperty, *this));
}
}
if (contexts.size() == 0)
contexts.push_back(new OpenCLContext(system, platformIndex, -1, *this));
contexts.push_back(new OpenCLContext(system, platformIndex, -1, precisionProperty, *this));
stringstream device;
for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0)
......@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
}
propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
contextEnergy.resize(contexts.size());
}
......
......@@ -2,17 +2,19 @@
* Apply the Andersen thermostat to adjust particle velocities.
*/
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global float4* velm, __global const float2* restrict stepSize, __global const float4* restrict random,
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global mixed4* velm, __global const mixed2* restrict stepSize, __global const float4* restrict random,
unsigned int randomIndex, __global const int* restrict atomGroups) {
float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
float randomRange = erf(collisionProbability/sqrt(2.0f));
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 velocity = velm[index];
mixed4 velocity = velm[index];
float4 selectRand = random[randomIndex+atomGroups[index]];
float4 velRand = random[randomIndex+index];
float scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0.0f : 1.0f);
float add = (1.0f-scale)*sqrt(kT*velocity.w);
velocity.xyz = scale*velocity.xyz + add*velRand.xyz;
real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1);
real add = (1-scale)*sqrt(kT*velocity.w);
velocity.x = scale*velocity.x + add*velRand.x;
velocity.y = scale*velocity.y + add*velRand.y;
velocity.z = scale*velocity.z + add*velRand.z;
velm[index] = velocity;
}
}
......@@ -2,13 +2,16 @@
* Perform the first step of Brownian integration.
*/
__kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __global const float4* restrict force,
__global float4* restrict posDelta, __global const float4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
__kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __global const real4* restrict force,
__global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
randomIndex += get_global_id(0);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float invMass = velm[index].w;
if (invMass != 0.0)
posDelta[index] = (float4) (tauDeltaT*invMass*force[index].xyz + noiseAmplitude*sqrt(invMass)*random[randomIndex].xyz, 0.0f);
mixed invMass = velm[index].w;
if (invMass != 0) {
posDelta[index] = (mixed4) (tauDeltaT*invMass*force[index].x + noiseAmplitude*sqrt(invMass)*random[randomIndex].x,
tauDeltaT*invMass*force[index].y + noiseAmplitude*sqrt(invMass)*random[randomIndex].y,
tauDeltaT*invMass*force[index].z + noiseAmplitude*sqrt(invMass)*random[randomIndex].z, 0);
}
randomIndex += get_global_size(0);
}
}
......@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
* Perform the second step of Brownian integration.
*/
__kernel void integrateBrownianPart2(float oneOverDeltaT, __global float4* posq, __global float4* velm, __global const float4* restrict posDelta) {
__kernel void integrateBrownianPart2(mixed oneOverDeltaT, __global real4* posq, __global real4* posqCorrection, __global mixed4* velm, __global const mixed4* restrict posDelta) {
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
if (velm[index].w != 0.0) {
float4 delta = posDelta[index];
velm[index].xyz = oneOverDeltaT*delta.xyz;
posq[index].xyz = posq[index].xyz + delta.xyz;
if (velm[index].w != 0) {
mixed4 delta = posDelta[index];
velm[index].x = oneOverDeltaT*delta.x;
velm[index].y = oneOverDeltaT*delta.y;
velm[index].z = oneOverDeltaT*delta.z;
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
}
}
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global float4* restrict constraintDistance, __global const float4* restrict atomPositions) {
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance, __global const real4* restrict atomPositions, __global const real4* restrict posCorrection) {
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the direction for this constraint.
int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index];
float4 oldPos1 = atomPositions[atoms.x];
float4 oldPos2 = atomPositions[atoms.y];
mixed4 dir = constraintDistance[index];
mixed4 oldPos1 = loadPos(atomPositions, posCorrection, atoms.x);
mixed4 oldPos2 = loadPos(atomPositions, posCorrection, atoms.y);
dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z;
......@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
/**
* Compute the force applied by each constraint.
*/
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const float4* restrict constraintDistance, __global const float4* restrict atomPositions,
__global const float* restrict reducedMass, __global float* restrict delta1, __global int* restrict converged, float tol, int iteration) {
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
__global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
__local int groupConverged;
if (converged[1-iteration%2]) {
if (get_global_id(0) == 0)
......@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
if (get_local_id(0) == 0)
groupConverged = 1;
barrier(CLK_LOCAL_MEM_FENCE);
float lowerTol = 1.0f-2.0f*tol+tol*tol;
float upperTol = 1.0f+2.0f*tol+tol*tol;
mixed lowerTol = 1-2*tol+tol*tol;
mixed upperTol = 1+2*tol+tol*tol;
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the force due to this constraint.
int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index];
float4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
mixed4 dir = constraintDistance[index];
mixed4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
#ifndef CONSTRAIN_VELOCITIES
rp_ij.xyz += dir.xyz;
#endif
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
#ifdef CONSTRAIN_VELOCITIES
delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2;
delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;
// See whether it has converged.
......@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
converged[iteration%2] = 0;
}
#else
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w;
float diff = dist2 - rp2;
mixed rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
mixed dist2 = dir.w*dir.w;
mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
// See whether it has converged.
......@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
/**
* Multiply the vector of constraint forces by the constraint matrix.
*/
__kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, __global float* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const float* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
__kernel void multiplyByConstraintMatrix(__global const mixed* restrict delta1, __global mixed* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const mixed* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
if (converged[iteration%2])
return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix.
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
float sum = 0.0f;
mixed sum = 0;
for (int i = 0; ; i++) {
int element = index+i*NUM_CONSTRAINTS;
int column = constraintMatrixColumn[element];
......@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1,
/**
* Update the atom positions based on constraint forces.
*/
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const float4* restrict constraintDistance,
__global float4* restrict atomPositions, __global const float4* restrict velm, __global const float* restrict delta1, __global const float* restrict delta2, __global int* restrict converged, int iteration) {
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const mixed4* restrict constraintDistance,
__global mixed4* restrict atomPositions, __global const mixed4* restrict velm, __global const mixed* restrict delta1, __global const mixed* restrict delta2, __global int* restrict converged, int iteration) {
if (get_global_id(0) == 0)
converged[1-iteration%2] = 1;
if (converged[iteration%2])
return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f);
mixed damping = (iteration < 2 ? 0.5f : 1.0f);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
// Compute the new position of this atom.
float4 atomPos = atomPositions[index];
float invMass = velm[index].w;
mixed4 atomPos = atomPositions[index];
mixed invMass = velm[index].w;
int num = numAtomConstraints[index];
for (int i = 0; i < num; i++) {
int constraint = atomConstraints[index+i*NUM_ATOMS];
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*delta2[constraint];
mixed constraintForce = damping*invMass*delta2[constraint];
constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = constraintDistance[constraint];
mixed4 dir = constraintDistance[constraint];
atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z;
......
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) {
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index];
position.xyz += posDelta[index].xyz;
posq[index] = position;
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
mixed4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
}
__kernel void computeSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
__local float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
float sum = 0.0f;
float sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
......@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa
result[outputIndex] = tempBuffer[0];
}
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) {
#ifdef SUPPORTS_DOUBLE_PRECISION
__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, unsigned int outputIndex, int bufferSize) {
__local double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
double sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
result[outputIndex] = tempBuffer[0];
}
#endif
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index];
position.xyz += posDelta[index].xyz;
posq[index] = position;
posDelta[index] = (float4) 0.0f;
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
posDelta[index] = (mixed4) 0;
}
}
......
__kernel void computeGlobal(__global float2* restrict dt, __global float* restrict globals, __global float* restrict params,
float uniform, float gaussian, __global const float* restrict energy) {
__kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
float uniform, float gaussian, __global const real* restrict energy) {
COMPUTE_STEP
}
#ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/**
* Load the position of a particle.
*/
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Store the position of a particle.
*/
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
__kernel void computePerDof(__global float4* restrict posq, __global float4* restrict posDelta, __global float4* restrict velm,
__global const float4* restrict force, __global const float2* restrict dt, __global const float* restrict globals,
__global const float* restrict params, __global float* restrict sum, __global const float4* restrict gaussianValues,
unsigned int randomIndex, __global const float4* restrict uniformValues, __global const float* restrict energy
__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
unsigned int randomIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
PARAMETER_ARGUMENTS) {
float stepSize = dt[0].y;
mixed stepSize = dt[0].y;
int index = get_global_id(0);
randomIndex += index;
while (index < NUM_ATOMS) {
#ifdef SUPPORTS_DOUBLE_PRECISION
#ifdef LOAD_POS_AS_DELTA
double4 position = convert_double4(posq[index]+posDelta[index]);
mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
#else
double4 position = convert_double4(posq[index]);
#endif
double4 velocity = convert_double4(velm[index]);
double4 f = convert_double4(force[index]);
double mass = 1.0/velocity.w;
#else
#ifdef LOAD_POS_AS_DELTA
float4 position = posq[index]+posDelta[index];
#else
float4 position = posq[index];
#endif
float4 velocity = velm[index];
float4 f = force[index];
float mass = 1.0f/velocity.w;
mixed4 position = loadPos(posq, posqCorrection, index);
#endif
mixed4 velocity = velm[index];
real4 f = force[index];
mixed mass = 1/velocity.w;
if (velocity.w != 0.0) {
float4 gaussian = gaussianValues[randomIndex];
float4 uniform = uniformValues[index];
......
float2 multofFloat2(float2 a, float2 b) {
return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
}
......
#ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
enum {VelScale, ForceScale, NoiseScale, MaxParams};
/**
* Perform the first step of Langevin integration.
*/
__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta,
__global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
float vscale = paramBuffer[VelScale];
float fscale = paramBuffer[ForceScale];
float noisescale = paramBuffer[NoiseScale];
float stepSize = dt[0].y;
__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
__global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale];
mixed noisescale = paramBuffer[NoiseScale];
mixed stepSize = dt[0].y;
int index = get_global_id(0);
randomIndex += index;
while (index < NUM_ATOMS) {
float4 velocity = velm[index];
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
float sqrtInvMass = sqrt(velocity.w);
velocity.xyz = vscale*velocity.xyz + fscale*velocity.w*force[index].xyz + noisescale*sqrtInvMass*random[randomIndex].xyz;
mixed sqrtInvMass = sqrt(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity;
posDelta[index] = stepSize*velocity;
}
......@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
* Perform the second step of Langevin integration.
*/
__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
#ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y;
#else
......@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
#endif
int index = get_global_id(0);
while (index < NUM_ATOMS) {
float4 vel = velm[index];
mixed4 vel = velm[index];
if (vel.w != 0.0) {
float4 pos = posq[index];
float4 delta = posDelta[index];
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz;
vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else
vel.xyz = invStepSize*delta.xyz;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
velm[index] = vel;
}
index += get_global_size(0);
......@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
* Select the step size to use for the next step.
*/
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt,
__global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) {
__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, __global mixed2* restrict dt,
__global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
// Calculate the error.
float err = 0.0f;
mixed err = 0.0f;
unsigned int index = get_local_id(0);
while (index < NUM_ATOMS) {
float4 f = force[index];
float invMass = velm[index].w;
real4 f = force[index];
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += get_global_size(0);
}
......@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
if (get_global_id(0) == 0) {
// Select the new step size.
float totalError = sqrt(error[0]/(NUM_ATOMS*3));
float newStepSize = sqrt(errorTol/totalError);
float oldStepSize = dt[0].y;
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
......@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
// Recalculate the integration parameters.
float vscale = exp(-newStepSize/tau);
float fscale = (1-vscale)*tau;
float noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
mixed vscale = exp(-newStepSize/tau);
mixed fscale = (1-vscale)*tau;
mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale;
params[ForceScale] = fscale;
params[NoiseScale] = noisescale;
......
......@@ -2,13 +2,16 @@
* Calculate the center of mass momentum.
*/
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
int index = get_global_id(0);
float4 cm = 0.0f;
while (index < numAtoms) {
float4 velocity = velm[index];
if (velocity.w != 0.0)
cm.xyz += velocity.xyz/velocity.w;
mixed4 velocity = velm[index];
if (velocity.w != 0) {
cm.x += velocity.x/velocity.w;
cm.y += velocity.y/velocity.w;
cm.z += velocity.z/velocity.w;
}
index += get_global_size(0);
}
......@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
* Remove center of mass motion.
*/
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups.
unsigned int index = get_local_id(0);
......@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4*
index = get_global_id(0);
while (index < numAtoms) {
velm[index].xyz -= cm.xyz;
velm[index].x -= cm.x;
velm[index].y -= cm.y;
velm[index].z -= cm.z;
index += get_global_size(0);
}
}
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Enforce constraints on SETTLE clusters
*/
__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
__kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0);
while (index < numClusters) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
float2 params = clusterParams[index];
float4 apos0 = oldPos[atoms.x];
float4 xp0 = posDelta[atoms.x];
float4 apos1 = oldPos[atoms.y];
float4 xp1 = posDelta[atoms.y];
float4 apos2 = oldPos[atoms.z];
float4 xp2 = posDelta[atoms.z];
float m0 = RECIP(velm[atoms.x].w);
float m1 = RECIP(velm[atoms.y].w);
float m2 = RECIP(velm[atoms.z].w);
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
mixed4 xp0 = posDelta[atoms.x];
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 xp1 = posDelta[atoms.y];
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
mixed4 xp2 = posDelta[atoms.z];
mixed m0 = 1/velm[atoms.x].w;
mixed m1 = 1/velm[atoms.y].w;
mixed m2 = 1/velm[atoms.z].w;
// Apply the SETTLE algorithm.
float xb0 = apos1.x-apos0.x;
float yb0 = apos1.y-apos0.y;
float zb0 = apos1.z-apos0.z;
float xc0 = apos2.x-apos0.x;
float yc0 = apos2.y-apos0.y;
float zc0 = apos2.z-apos0.z;
float invTotalMass = 1.0f/(m0+m1+m2);
float xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
float ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
float zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
float xa1 = xp0.x - xcom;
float ya1 = xp0.y - ycom;
float za1 = xp0.z - zcom;
float xb1 = xb0 + xp1.x - xcom;
float yb1 = yb0 + xp1.y - ycom;
float zb1 = zb0 + xp1.z - zcom;
float xc1 = xc0 + xp2.x - xcom;
float yc1 = yc0 + xp2.y - ycom;
float zc1 = zc0 + xp2.z - zcom;
float xaksZd = yb0*zc0 - zb0*yc0;
float yaksZd = zb0*xc0 - xb0*zc0;
float zaksZd = xb0*yc0 - yb0*xc0;
float xaksXd = ya1*zaksZd - za1*yaksZd;
float yaksXd = za1*xaksZd - xa1*zaksZd;
float zaksXd = xa1*yaksZd - ya1*xaksZd;
float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
float axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
float aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
float azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
float trns11 = xaksXd / axlng;
float trns21 = yaksXd / axlng;
float trns31 = zaksXd / axlng;
float trns12 = xaksYd / aylng;
float trns22 = yaksYd / aylng;
float trns32 = zaksYd / aylng;
float trns13 = xaksZd / azlng;
float trns23 = yaksZd / azlng;
float trns33 = zaksZd / azlng;
float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
mixed xb0 = apos1.x-apos0.x;
mixed yb0 = apos1.y-apos0.y;
mixed zb0 = apos1.z-apos0.z;
mixed xc0 = apos2.x-apos0.x;
mixed yc0 = apos2.y-apos0.y;
mixed zc0 = apos2.z-apos0.z;
mixed invTotalMass = 1.0f/(m0+m1+m2);
mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
mixed xa1 = xp0.x - xcom;
mixed ya1 = xp0.y - ycom;
mixed za1 = xp0.z - zcom;
mixed xb1 = xb0 + xp1.x - xcom;
mixed yb1 = yb0 + xp1.y - ycom;
mixed zb1 = zb0 + xp1.z - zcom;
mixed xc1 = xc0 + xp2.x - xcom;
mixed yc1 = yc0 + xp2.y - ycom;
mixed zc1 = zc0 + xp2.z - zcom;
mixed xaksZd = yb0*zc0 - zb0*yc0;
mixed yaksZd = zb0*xc0 - xb0*zc0;
mixed zaksZd = xb0*yc0 - yb0*xc0;
mixed xaksXd = ya1*zaksZd - za1*yaksZd;
mixed yaksXd = za1*xaksZd - xa1*zaksZd;
mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng;
mixed trns12 = xaksYd / aylng;
mixed trns22 = yaksYd / aylng;
mixed trns32 = zaksYd / aylng;
mixed trns13 = xaksZd / azlng;
mixed trns23 = yaksZd / azlng;
mixed trns33 = zaksZd / azlng;
mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' ---
float rc = 0.5*params.y;
float rb = sqrt(params.x*params.x-rc*rc);
float ra = rb*(m1+m2)*invTotalMass;
mixed rb = sqrt(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra;
float sinphi = za1d / ra;
float cosphi = sqrt(1.0f - sinphi*sinphi);
float sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
float cospsi = sqrt(1.0f - sinpsi*sinpsi);
float ya2d = ra*cosphi;
float xb2d = - rc*cospsi;
float yb2d = - rb*cosphi - rc*sinpsi*sinphi;
float yc2d = - rb*cosphi + rc*sinpsi*sinphi;
float xb2d2 = xb2d*xb2d;
float hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
float deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
mixed sinphi = za1d / ra;
mixed cosphi = sqrt(1.0f - sinphi*sinphi);
mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi;
mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5;
// --- Step3 al,be,ga ---
float alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
float beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
float gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
float al2be2 = alpha*alpha + beta*beta;
float sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' ---
float costheta = sqrt(1.0f - sintheta*sintheta);
float xa3d = - ya2d*sintheta;
float ya3d = ya2d*costheta;
float za3d = za1d;
float xb3d = xb2d*costheta - yb2d*sintheta;
float yb3d = xb2d*sintheta + yb2d*costheta;
float zb3d = zb1d;
float xc3d = - xb2d*costheta - yc2d*sintheta;
float yc3d = - xb2d*sintheta + yc2d*costheta;
float zc3d = zc1d;
mixed costheta = sqrt(1.0f - sintheta*sintheta);
mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta;
mixed za3d = za1d;
mixed xb3d = xb2d*costheta - yb2d*sintheta;
mixed yb3d = xb2d*sintheta + yb2d*costheta;
mixed zb3d = zb1d;
mixed xc3d = - xb2d*costheta - yc2d*sintheta;
mixed yc3d = - xb2d*sintheta + yc2d*costheta;
mixed zc3d = zc1d;
// --- Step5 A3 ---
float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3;
xp0.y = ycom + ya3;
......@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res
* Enforce velocity constraints on SETTLE clusters
*/
__kernel void constrainVelocities(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
__kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
float4 apos0 = oldPos[atoms.x];
float4 apos1 = oldPos[atoms.y];
float4 apos2 = oldPos[atoms.z];
float4 v0 = velm[atoms.x];
float4 v1 = velm[atoms.y];
float4 v2 = velm[atoms.z];
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
mixed4 v0 = velm[atoms.x];
mixed4 v1 = velm[atoms.y];
mixed4 v2 = velm[atoms.z];
// Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
// and the angle cosines and sines.
float mA = RECIP(v0.w);
float mB = RECIP(v1.w);
float mC = RECIP(v2.w);
float4 eAB = apos1-apos0;
float4 eBC = apos2-apos1;
float4 eCA = apos0-apos2;
eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
float vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
float vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
float vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
float cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
float cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
float cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
float s2A = 1-cA*cA;
float s2B = 1-cB*cB;
float s2C = 1-cC*cC;
mixed mA = 1/v0.w;
mixed mB = 1/v1.w;
mixed mC = 1/v2.w;
mixed4 eAB = apos1-apos0;
mixed4 eBC = apos2-apos1;
mixed4 eCA = apos0-apos2;
eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
mixed s2A = 1-cA*cA;
mixed s2B = 1-cB*cB;
mixed s2C = 1-cC*cC;
// Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
// in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
// making that assumption). We allow all three atoms to have different masses.
float mABCinv = RECIP(mA*mB*mC);
float denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
float tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
float tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
float tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
mixed mABCinv = 1/(mA*mB*mC);
mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment