Commit 8d6a2a01 authored by Peter Eastman
Browse files

Beginnings of mixed/double precision support in OpenCL

parent a3d5f834
......@@ -68,11 +68,18 @@ public:
static const std::string key = "OpenCLPlatformIndex";
return key;
}
/**
 * Name of the platform property used to choose the numerical precision.
 * The OpenCLContext constructor recognizes the precision strings "single",
 * "mixed" and "double" and throws an OpenMMException for any other value.
 *
 * @return a reference to a function-local static string holding "OpenCLPrecision"
 */
static const std::string& OpenCLPrecision() {
    static const std::string propertyName("OpenCLPrecision");
    return propertyName;
}
};
class OPENMM_EXPORT OpenCLPlatform::PlatformData {
public:
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty);
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
~PlatformData();
void initializeContexts(const System& system);
void syncContexts();
......
......@@ -65,10 +65,24 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
std::cerr << "OpenCL internal error: " << errinfo << std::endl;
}
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData) :
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for OpenCLPrecision: "+precision);
try {
contextIndex = platformData.contexts.size();
std::vector<cl::Platform> platforms;
......@@ -217,8 +231,27 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this);
if (useDoublePrecision) {
posq = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_double4";
compilationDefines["convert_mixed4"] = "convert_double4";
}
else if (useMixedPrecision) {
    // Mixed precision: positions are kept as mm_float4 together with a separate
    // mm_float4 correction array, while velocities are kept as mm_double4.
    posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
    // Give the correction array its own label; it was previously labelled "posq",
    // which made it indistinguishable from the real position array by name.
    posqCorrection = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posqCorrection");
    velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
    compilationDefines["USE_MIXED_PRECISION"] = "1";
    compilationDefines["convert_real4"] = "convert_float4";
    compilationDefines["convert_mixed4"] = "convert_double4";
}
else {
posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "velm");
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["convert_mixed4"] = "convert_float4";
}
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
}
catch (cl::Error err) {
......@@ -241,6 +274,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
if (!useDoublePrecision) {
cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
OpenCLArray valuesArray(*this, 20, sizeof(mm_float8), "values");
vector<mm_float8> values(valuesArray.getSize());
......@@ -269,6 +303,14 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
}
else {
compilationDefines["SQRT"] = "sqrt";
compilationDefines["RSQRT"] = "rsqrt";
compilationDefines["RECIP"] = "1.0/";
compilationDefines["EXP"] = "exp";
compilationDefines["LOG"] = "log";
}
// Create the work thread used for parallelization when running on multiple devices.
......@@ -311,18 +353,21 @@ OpenCLContext::~OpenCLContext() {
}
void OpenCLContext::initialize() {
vector<mm_float4> v(paddedNumAtoms, mm_float4(0, 0, 0, 0));
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
v[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
}
velm->upload(v);
bonded->initialize(system);
numForceBuffers = platformData.contexts.size();
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
if (useDoublePrecision) {
forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
}
else {
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
}
if (supports64BitGlobalAtomics) {
longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
......@@ -332,12 +377,18 @@ void OpenCLContext::initialize() {
addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
}
addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
int bufferBytes = max(posq->getSize()*sizeof(mm_float4), energyBuffer->getSize()*sizeof(cl_float));
int bufferBytes = max(posq->getSize()*posq->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((mm_double4*) pinnedMemory)[i] = mm_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (cl_float) (1.0/mass));
}
velm->upload(pinnedMemory);
atomIndexDevice = OpenCLArray::create<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
......@@ -382,6 +433,28 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
}
if (!compilationDefines.empty())
src << endl;
if (supportsDoublePrecision)
src << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float4 mixed4;\n";
}
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
......@@ -764,11 +837,47 @@ void OpenCLContext::validateMolecules() {
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector<mm_int4> newCellOffsets(numAtoms);
if (useDoublePrecision) {
vector<mm_double4> oldPosq(paddedNumAtoms);
vector<mm_double4> newPosq(paddedNumAtoms);
vector<mm_double4> oldVelm(paddedNumAtoms);
vector<mm_double4> newVelm(paddedNumAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
else if (useMixedPrecision) {
    // Mixed precision: posq and posqCorrection hold mm_float4 elements, velm holds mm_double4.
    vector<mm_float4> oldPosq(paddedNumAtoms);
    vector<mm_float4> newPosq(paddedNumAtoms);
    vector<mm_float4> oldPosqCorrection(paddedNumAtoms);
    vector<mm_float4> newPosqCorrection(paddedNumAtoms);
    vector<mm_double4> oldVelm(paddedNumAtoms);
    vector<mm_double4> newVelm(paddedNumAtoms);
    posq->download(oldPosq);
    // The correction array must be fetched too; otherwise the loop below permutes
    // host values that were never read from the device.
    posqCorrection->download(oldPosqCorrection);
    velm->download(oldVelm);
    for (int i = 0; i < numAtoms; i++) {
        int index = atomIndex[i];
        newPosq[index] = oldPosq[i];
        newPosqCorrection[index] = oldPosqCorrection[i];
        newVelm[index] = oldVelm[i];
        newCellOffsets[index] = posCellOffsets[i];
    }
    posq->upload(newPosq);
    // Write the permuted corrections back so they stay aligned with posq,
    // mirroring what reorderAtomsImpl() does for mixed precision.
    posqCorrection->upload(newPosqCorrection);
    velm->upload(newVelm);
}
else {
vector<mm_float4> oldPosq(paddedNumAtoms);
vector<mm_float4> newPosq(paddedNumAtoms);
vector<mm_float4> oldVelm(paddedNumAtoms);
vector<mm_float4> newVelm(paddedNumAtoms);
vector<mm_int4> newCellOffsets(numAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
......@@ -779,12 +888,11 @@ void OpenCLContext::validateMolecules() {
}
posq->upload(newPosq);
velm->upload(newVelm);
}
for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = i;
posCellOffsets[i] = newCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
atomIndexDevice->upload(atomIndex);
findMoleculeGroups();
for (int i = 0; i < (int) reorderListeners.size(); i++)
......@@ -797,16 +905,29 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if (moleculesInvalid)
validateMolecules();
atomsWereReordered = true;
if (useDoublePrecision)
reorderAtomsImpl<cl_double, mm_double4, cl_double, mm_double4>(enforcePeriodic);
else if (useMixedPrecision)
reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>(enforcePeriodic);
else
reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>(enforcePeriodic);
}
template <class Real, class Real4, class Mixed, class Mixed4>
void OpenCLContext::reorderAtomsImpl(bool enforcePeriodic) {
// Find the range of positions and the number of bins along each axis.
vector<mm_float4> oldPosq(paddedNumAtoms);
vector<mm_float4> oldVelm(paddedNumAtoms);
vector<Real4> oldPosq(paddedNumAtoms);
vector<Real4> oldPosqCorrection(paddedNumAtoms);
vector<Mixed4> oldVelm(paddedNumAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
float minx = oldPosq[0].x, maxx = oldPosq[0].x;
float miny = oldPosq[0].y, maxy = oldPosq[0].y;
float minz = oldPosq[0].z, maxz = oldPosq[0].z;
if (useMixedPrecision)
posqCorrection->download(oldPosqCorrection);
Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
if (nonbonded->getUsePeriodic()) {
minx = miny = minz = 0.0;
maxx = periodicBoxSize.x;
......@@ -815,7 +936,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
}
else {
for (int i = 1; i < numAtoms; i++) {
const mm_float4& pos = oldPosq[i];
const Real4& pos = oldPosq[i];
minx = min(minx, pos.x);
maxx = max(maxx, pos.x);
miny = min(miny, pos.y);
......@@ -828,8 +949,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// Loop over each group of identical molecules and reorder them.
vector<int> originalIndex(numAtoms);
vector<mm_float4> newPosq(paddedNumAtoms);
vector<mm_float4> newVelm(paddedNumAtoms);
vector<Real4> newPosq(paddedNumAtoms);
vector<Real4> newPosqCorrection(paddedNumAtoms);
vector<Mixed4> newVelm(paddedNumAtoms);
vector<mm_int4> newCellOffsets(numAtoms);
for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// Find the center of each molecule.
......@@ -837,15 +959,15 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
MoleculeGroup& mol = moleculeGroups[group];
int numMolecules = mol.offsets.size();
vector<int>& atoms = mol.atoms;
vector<mm_float4> molPos(numMolecules);
float invNumAtoms = 1.0f/atoms.size();
vector<Real4> molPos(numMolecules);
Real invNumAtoms = (Real) (1.0/atoms.size());
for (int i = 0; i < numMolecules; i++) {
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < (int)atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
const mm_float4& pos = oldPosq[atom];
const Real4& pos = oldPosq[atom];
molPos[i].x += pos.x;
molPos[i].y += pos.y;
molPos[i].z += pos.z;
......@@ -861,9 +983,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
float dx = xcell*periodicBoxSize.x;
float dy = ycell*periodicBoxSize.y;
float dz = zcell*periodicBoxSize.z;
Real dx = xcell*periodicBoxSize.x;
Real dy = ycell*periodicBoxSize.y;
Real dz = zcell*periodicBoxSize.z;
if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
molPos[i].x -= dx;
molPos[i].y -= dy;
......@@ -871,7 +993,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if (enforcePeriodic) {
for (int j = 0; j < (int) atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
mm_float4 p = oldPosq[atom];
Real4 p = oldPosq[atom];
p.x -= dx;
p.y -= dy;
p.z -= dz;
......@@ -888,12 +1010,12 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
float binWidth;
Real binWidth;
if (useHilbert)
binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
else
binWidth = (float)(0.2*nonbonded->getCutoffDistance());
float invBinWidth = 1.0f/binWidth;
binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
Real invBinWidth = (Real) (1.0/binWidth);
int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
vector<pair<int, int> > molBins(numMolecules);
......@@ -928,6 +1050,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
int newIndex = mol.offsets[i]+atoms[j];
originalIndex[newIndex] = atomIndex[oldIndex];
newPosq[newIndex] = oldPosq[oldIndex];
if (useMixedPrecision)
newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
newVelm[newIndex] = oldVelm[oldIndex];
newCellOffsets[newIndex] = posCellOffsets[oldIndex];
}
......@@ -941,6 +1065,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
posCellOffsets[i] = newCellOffsets[i];
}
posq->upload(newPosq);
if (useMixedPrecision)
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm);
atomIndexDevice->upload(atomIndex);
for (int i = 0; i < (int) reorderListeners.size(); i++)
......
......@@ -62,7 +62,7 @@ struct mm_float2 {
mm_float2(cl_float x, cl_float y) : x(x), y(y) {
}
};
struct mm_float4 {
struct mm_float4 {
cl_float x, y, z, w;
mm_float4() {
}
......@@ -87,6 +87,20 @@ struct mm_float16 {
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
}
};
struct mm_double2 {
cl_double x, y;
mm_double2() {
}
mm_double2(cl_double x, cl_double y) : x(x), y(y) {
}
};
struct mm_double4 {
cl_double x, y, z, w;
mm_double4() {
}
mm_double4(cl_double x, cl_double y, cl_double z, cl_double w) : x(x), y(y), z(z), w(w) {
}
};
struct mm_ushort2 {
cl_ushort x, y;
mm_ushort2() {
......@@ -145,7 +159,7 @@ public:
class ReorderListener;
static const int ThreadBlockSize;
static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData);
OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
~OpenCLContext();
/**
* This is called to initialize internal data structures after all Forces in the system
......@@ -198,6 +212,12 @@ public:
OpenCLArray& getPosq() {
return *posq;
}
/**
* Get the array which contains a correction to the position of each atom.  This only exists if getUseMixedPrecision() returns true.
* The array is allocated only in the mixed precision branch of the constructor, so in single or double precision
* mode the pointer dereferenced here does not refer to a valid array.  Callers must check getUseMixedPrecision() first.
*/
OpenCLArray& getPosqCorrection() {
return *posqCorrection;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
......@@ -405,18 +425,38 @@ public:
bool getSupportsDoublePrecision() {
return supportsDoublePrecision;
}
/**
* Get whether double precision is being used.  This is true only when the context was constructed with the
* precision string "double"; it is false for both "single" and "mixed".
*/
bool getUseDoublePrecision() {
return useDoublePrecision;
}
/**
* Get whether mixed precision is being used.  This is true only when the context was constructed with the
* precision string "mixed"; it is never true at the same time as getUseDoublePrecision().
*/
bool getUseMixedPrecision() {
return useMixedPrecision;
}
/**
* Get the size of the periodic box as single precision values.  The x, y and z components are the values
* most recently passed to setPeriodicBoxSize(), converted to float; the w component is 0.
*/
mm_float4 getPeriodicBoxSize() const {
return periodicBoxSize;
}
/**
* Get the size of the periodic box as double precision values.  The x, y and z components are the values
* most recently passed to setPeriodicBoxSize(), stored without conversion to float; the w component is 0.
*/
mm_double4 getPeriodicBoxSizeDouble() const {
return periodicBoxSizeDouble;
}
/**
 * Set the size of the periodic box.  Both the single precision and the double
 * precision copies of the box size and of its component-wise inverse are updated;
 * the w component of each is set to 0.
 *
 * @param xsize  box length along x
 * @param ysize  box length along y
 * @param zsize  box length along z
 */
void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
    const double invX = 1.0/xsize;
    const double invY = 1.0/ysize;
    const double invZ = 1.0/zsize;
    periodicBoxSizeDouble = mm_double4(xsize, ysize, zsize, 0);
    invPeriodicBoxSizeDouble = mm_double4(invX, invY, invZ, 0);
    periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
    invPeriodicBoxSize = mm_float4((float) invX, (float) invY, (float) invZ, 0);
}
/**
* Get the inverse of the size of the periodic box.
......@@ -424,6 +464,12 @@ public:
mm_float4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize;
}
/**
* Get the inverse of the size of the periodic box as double precision values.  The x, y and z components are
* 1/xsize, 1/ysize and 1/zsize for the sizes most recently passed to setPeriodicBoxSize(); the w component is 0.
*/
mm_double4 getInvPeriodicBoxSizeDouble() const {
return invPeriodicBoxSizeDouble;
}
/**
* Get the OpenCLIntegrationUtilities for this context.
*/
......@@ -502,6 +548,11 @@ private:
* of molecules and resort the atoms.
*/
void validateMolecules();
/**
* This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
* reorderAtoms() instantiates it as <cl_double, mm_double4, cl_double, mm_double4> for double precision,
* <cl_float, mm_float4, cl_double, mm_double4> for mixed precision, and
* <cl_float, mm_float4, cl_float, mm_float4> for single precision.
*
* Real and Real4 are the scalar and four-component host types used for the data downloaded from posq
* (and from posqCorrection in mixed precision); Mixed4 is the four-component host type used for the data
* downloaded from velm.
*
* @param enforcePeriodic  passed through from reorderAtoms().  NOTE(review): in the visible part of the
*                         implementation, when this is true the atoms of a molecule whose center was wrapped
*                         back into the periodic box are shifted by the same offset - confirm against the full body.
*/
template <class Real, class Real4, class Mixed, class Mixed4>
void reorderAtomsImpl(bool enforcePeriodic);
const System& system;
double time;
OpenCLPlatform::PlatformData& platformData;
......@@ -515,9 +566,9 @@ private:
int numThreadBlocks;
int numForceBuffers;
int simdWidth;
bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered, moleculesInvalid;
mm_float4 periodicBoxSize;
mm_float4 invPeriodicBoxSize;
bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, moleculesInvalid;
mm_float4 periodicBoxSize, invPeriodicBoxSize;
mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble;
std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines;
cl::Context context;
......@@ -538,6 +589,7 @@ private:
cl::Buffer* pinnedBuffer;
void* pinnedMemory;
OpenCLArray* posq;
OpenCLArray* posqCorrection;
OpenCLArray* velm;
OpenCLArray* force;
OpenCLArray* forceBuffers;
......
......@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in
}
};
/**
 * Bind the position correction buffer to one argument of a kernel.
 * In mixed precision the context's posqCorrection device buffer is bound;
 * in every other mode a NULL pointer value is passed for that argument instead.
 *
 * @param cl      the context that owns the posqCorrection array
 * @param kernel  the kernel whose argument is being set
 * @param index   the index of the kernel argument to set
 */
static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (!cl.getUseMixedPrecision()) {
        // No correction array exists outside mixed precision mode.
        kernel.setArg<void*>(index, NULL);
        return;
    }
    kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
}
OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
......@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays.
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = OpenCLArray::create<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_double4> deltas(posDelta->getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_double2>(context, 1, "stepSize");
vector<mm_double2> step(1, mm_double2(0.0, 0.0));
stepSize->upload(step);
}
else {
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0, 0.0, 0.0, 0.0));
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
stepSize->upload(step);
}
// Create kernels for enforcing constraints.
......@@ -458,23 +475,57 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Record the CCMA data structures.
ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
ccmaDistance = OpenCLArray::create<mm_double4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_double4> distanceVec(ccmaDistance->getSize());
vector<cl_double> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = distance[c];
reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
}
else {
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
......@@ -496,13 +547,14 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaAtoms->upload(atomsVec);
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
}
ccmaAtoms->upload(atomsVec);
ccmaAtomConstraints->upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
// Create the CCMA kernels.
......@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer());
setPosqCorrectionArg(context, vsitePositionKernel, 1);
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
// Skip argument 1: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
setPosqCorrectionArg(context, vsiteForceKernel, 1);
// Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
numVsites = num2Avg+num3Avg+numOutOfPlane;
}
......@@ -686,11 +740,18 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
if (!hasInitialized) {
settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer());
}
if (context.getUseMixedPrecision())
settleKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
settleKernel.setArg<void*>(3, NULL);
settleKernel.setArg<cl::Buffer>(4, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel.setArg<cl_double>(1, (cl_double) tol);
else
settleKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(settleKernel, settleAtoms->getSize());
}
......@@ -698,10 +759,17 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
if (!hasInitialized) {
shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer());
if (context.getUseMixedPrecision())
shakeKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
shakeKernel.setArg<void*>(3, NULL);
shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(6, shakeParams->getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel.setArg<cl_double>(1, (cl_double) tol);
else
shakeKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(shakeKernel, shakeAtoms->getSize());
}
......@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
ccmaDirectionsKernel.setArg<void*>(3, NULL);
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
......@@ -730,6 +802,9 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
else
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
const int checkInterval = 4;
......@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
/**
 * Launch the virtual-site force kernel, binding the context's force buffer as
 * a kernel argument immediately before the launch.
 *
 * Does nothing when numVsites is zero.
 */
void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) {
// NOTE(review): argument slots 1 and 2 are both bound to the force buffer here.
// Confirm against the kernel's parameter list that both bindings are intended.
vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
// numVsites is passed to executeKernel as the amount of work to launch.
context.executeKernel(vsiteForceKernel, numVsites);
}
}
......
......@@ -66,6 +66,13 @@ static string intToString(int value) {
return s.str();
}
/**
 * Bind the position-correction argument of a kernel.
 *
 * In mixed precision mode the context's posq correction buffer is bound to the
 * given argument slot; in every other mode a null pointer is bound instead.
 *
 * @param cl      the context that owns the correction buffer
 * @param kernel  the kernel whose argument is being set
 * @param index   the argument slot to fill
 */
static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (!cl.getUseMixedPrecision()) {
        // Not mixed precision: fill the slot with a null pointer.
        kernel.setArg<void*>(index, NULL);
        return;
    }
    kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
}
static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
const Lepton::Operation& op = expression.getRootNode().getOperation();
if (op.getId() != Lepton::Operation::CONSTANT)
......@@ -139,24 +146,62 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
}
/**
 * Copy the current particle positions from the device into `positions`.
 *
 * The device array is indexed by device slot; cl.getAtomIndex() maps each slot
 * to the particle index used in the output vector. For every slot, the recorded
 * cell offset multiplied by the periodic box size is subtracted from the stored
 * coordinates before the result is written.
 *
 * The device representation depends on the precision mode:
 *  - double: posq holds mm_double4 values;
 *  - mixed:  posq holds mm_float4 values and a second mm_float4 array holds a
 *            correction that is added (in double) to each coordinate;
 *  - single: posq holds mm_float4 values.
 *
 * @param context    supplies the System from which the particle count is read
 * @param positions  resized to the particle count and filled with the positions
 */
void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
    // The box size is always taken in double so the offset arithmetic below is
    // done in double regardless of the storage precision.
    mm_double4 periodicBoxSize = cl.getPeriodicBoxSizeDouble();
    if (cl.getUseDoublePrecision()) {
        mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; ++i) {
            mm_double4 pos = posq[i];
            mm_int4 offset = cl.getPosCellOffsets()[i];
            positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
        }
    }
    else if (cl.getUseMixedPrecision()) {
        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
        // The pinned buffer is already in use for posq, so the correction is
        // downloaded into a separate vector.
        vector<mm_float4> posCorrection;
        cl.getPosq().download(posq);
        cl.getPosqCorrection().download(posCorrection);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 pos1 = posq[i];
            mm_float4 pos2 = posCorrection[i];
            mm_int4 offset = cl.getPosCellOffsets()[i];
            // Widen both parts to double before summing them.
            positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x-offset.x*periodicBoxSize.x, (double)pos1.y+(double)pos2.y-offset.y*periodicBoxSize.y, (double)pos1.z+(double)pos2.z-offset.z*periodicBoxSize.z);
        }
    }
    else {
        mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 pos = posq[i];
            mm_int4 offset = cl.getPosCellOffsets()[i];
            positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
        }
    }
}
void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
if (cl.getUseDoublePrecision()) {
mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
mm_double4& pos = posq[i];
const Vec3& p = positions[order[i]];
pos.x = p[0];
pos.y = p[1];
pos.z = p[2];
}
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
posq[i] = mm_double4(0.0, 0.0, 0.0, 0.0);
cl.getPosq().upload(posq);
}
else {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
mm_float4& pos = posq[i];
const Vec3& p = positions[order[i]];
......@@ -167,53 +212,106 @@ void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vecto
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
posq[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
cl.getPosq().upload(posq);
}
if (cl.getUseMixedPrecision()) {
mm_float4* posCorrection = (mm_float4*) cl.getPinnedBuffer();
for (int i = 0; i < numParticles; ++i) {
mm_float4& c = posCorrection[i];
const Vec3& p = positions[order[i]];
c.x = (cl_float) (p[0]-(cl_float)p[0]);
c.y = (cl_float) (p[1]-(cl_float)p[1]);
c.z = (cl_float) (p[2]-(cl_float)p[2]);
c.w = 0;
}
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
posCorrection[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
cl.getPosqCorrection().upload(posCorrection);
}
for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++)
cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0);
}
/**
 * Copy the current particle velocities from the device into `velocities`.
 *
 * The device array is indexed by device slot; cl.getAtomIndex() maps each slot
 * to the particle index used in the output vector. Only the x, y and z
 * components of each element are returned.
 *
 * In double and mixed precision modes the velocity array is read as
 * mm_double4; otherwise it is read as mm_float4.
 *
 * @param context     supplies the System from which the particle count is read
 * @param velocities  resized to the particle count and filled with the velocities
 */
void OpenCLUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            mm_double4 vel = velm[i];
            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
        }
    }
    else {
        mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 vel = velm[i];
            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
        }
    }
}
/**
 * Upload new particle velocities to the device.
 *
 * The current device array is downloaded first and only the x, y and z
 * components of each element are overwritten, so the w component of every
 * real particle keeps whatever value the device already held. Elements past
 * the particle count (padding up to cl.getPaddedNumAtoms()) are zeroed.
 *
 * In double and mixed precision modes the velocity array is handled as
 * mm_double4; otherwise it is handled as mm_float4 and the incoming values
 * are narrowed to cl_float.
 *
 * @param context     supplies the System from which the particle count is read
 * @param velocities  the new velocities, indexed by particle; cl.getAtomIndex()
 *                    gives the particle index stored in each device slot
 */
void OpenCLUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            mm_double4& vel = velm[i];
            const Vec3& p = velocities[order[i]];
            vel.x = p[0];
            vel.y = p[1];
            vel.z = p[2];
        }
        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
            velm[i] = mm_double4(0.0, 0.0, 0.0, 0.0);
        cl.getVelm().upload(velm);
    }
    else {
        mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4& vel = velm[i];
            const Vec3& p = velocities[order[i]];
            // Explicit narrowing: Vec3 components are double.
            vel.x = (cl_float) p[0];
            vel.y = (cl_float) p[1];
            vel.z = (cl_float) p[2];
        }
        for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
            velm[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
        cl.getVelm().upload(velm);
    }
}
/**
 * Copy the current per-particle forces from the device into `forces`.
 *
 * The device array is indexed by device slot; cl.getAtomIndex() maps each slot
 * to the particle index used in the output vector. Only the x, y and z
 * components of each element are returned.
 *
 * The force array is read as mm_double4 only in double precision mode; in
 * every other mode, mixed included, it is read as mm_float4.
 *
 * @param context  supplies the System from which the particle count is read
 * @param forces   resized to the particle count and filled with the forces
 */
void OpenCLUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
    const vector<cl_int>& order = cl.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    forces.resize(numParticles);
    if (cl.getUseDoublePrecision()) {
        mm_double4* force = (mm_double4*) cl.getPinnedBuffer();
        cl.getForce().download(force);
        for (int i = 0; i < numParticles; ++i) {
            mm_double4 f = force[i];
            forces[order[i]] = Vec3(f.x, f.y, f.z);
        }
    }
    else {
        mm_float4* force = (mm_float4*) cl.getPinnedBuffer();
        cl.getForce().download(force);
        for (int i = 0; i < numParticles; ++i) {
            mm_float4 f = force[i];
            forces[order[i]] = Vec3(f.x, f.y, f.z);
        }
    }
}
void OpenCLUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
mm_float4 box = cl.getPeriodicBoxSize();
mm_double4 box = cl.getPeriodicBoxSizeDouble();
a = Vec3(box.x, 0, 0);
b = Vec3(0, box.y, 0);
c = Vec3(0, 0, box.z);
......@@ -228,6 +326,8 @@ void OpenCLUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, co
void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
int version = 1;
stream.write((char*) &version, sizeof(int));
int precision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
stream.write((char*) &precision, sizeof(int));
double time = cl.getTime();
stream.write((char*) &time, sizeof(double));
int stepCount = cl.getStepCount();
......@@ -235,10 +335,14 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream
int computeForceCount = cl.getComputeForceCount();
stream.write((char*) &computeForceCount, sizeof(int));
char* buffer = (char*) cl.getPinnedBuffer();
cl.getPosq().download((mm_float4*) buffer);
stream.write(buffer, sizeof(mm_float4)*cl.getPosq().getSize());
cl.getVelm().download((mm_float4*) buffer);
stream.write(buffer, sizeof(mm_float4)*cl.getVelm().getSize());
cl.getPosq().download(buffer);
stream.write(buffer, cl.getPosq().getSize()*cl.getPosq().getElementSize());
if (cl.getUseMixedPrecision()) {
cl.getPosqCorrection().download(buffer);
stream.write(buffer, cl.getPosqCorrection().getSize()*cl.getPosqCorrection().getElementSize());
}
cl.getVelm().download(buffer);
stream.write(buffer, cl.getVelm().getSize()*cl.getVelm().getElementSize());
stream.write((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
stream.write((char*) &cl.getPosCellOffsets()[0], sizeof(mm_int4)*cl.getPosCellOffsets().size());
mm_float4 box = cl.getPeriodicBoxSize();
......@@ -252,6 +356,11 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
stream.read((char*) &version, sizeof(int));
if (version != 1)
throw OpenMMException("Checkpoint was created with a different version of OpenMM");
int precision;
stream.read((char*) &precision, sizeof(int));
int expectedPrecision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
if (precision != expectedPrecision)
throw OpenMMException("Checkpoint was created with a different numeric precision");
double time;
stream.read((char*) &time, sizeof(double));
int stepCount, computeForceCount;
......@@ -264,9 +373,13 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
contexts[i]->setComputeForceCount(computeForceCount);
}
char* buffer = (char*) cl.getPinnedBuffer();
stream.read(buffer, sizeof(mm_float4)*cl.getPosq().getSize());
stream.read(buffer, cl.getPosq().getSize()*cl.getPosq().getElementSize());
cl.getPosq().upload(buffer);
stream.read(buffer, sizeof(mm_float4)*cl.getVelm().getSize());
if (cl.getUseMixedPrecision()) {
stream.read(buffer, cl.getPosqCorrection().getSize()*cl.getPosqCorrection().getElementSize());
cl.getPosqCorrection().upload(buffer);
}
stream.read(buffer, cl.getVelm().getSize()*cl.getVelm().getElementSize());
cl.getVelm().upload(buffer);
stream.read((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
cl.getAtomIndexArray().upload(cl.getAtomIndex());
......@@ -292,7 +405,8 @@ void OpenCLApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
cl::Program program = cl.createProgram(OpenCLKernelSources::constraints, defines);
applyDeltasKernel = cl::Kernel(program, "applyPositionDeltas");
applyDeltasKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
applyDeltasKernel.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer());
setPosqCorrectionArg(cl, applyDeltasKernel, 1);
applyDeltasKernel.setArg<cl::Buffer>(2, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer());
}
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
cl.clearBuffer(integration.getPosDelta());
......@@ -4000,19 +4114,28 @@ void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const Verlet
kernel1.setArg<cl_int>(0, numAtoms);
kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel1, 3);
kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(6, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl_int>(0, numAtoms);
kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel2, 3);
kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
}
if (dt != prevStepSize) {
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
vector<mm_double2> stepSizeVec(1);
stepSizeVec[0] = mm_double2(dt, dt);
cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
else {
vector<mm_float2> stepSizeVec(1);
stepSizeVec[0] = mm_float2((cl_float) dt, (cl_float) dt);
cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
prevStepSize = dt;
}
......@@ -4055,7 +4178,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L
cl::Program program = cl.createProgram(OpenCLKernelSources::langevin, defines, "");
kernel1 = cl::Kernel(program, "integrateLangevinPart1");
kernel2 = cl::Kernel(program, "integrateLangevinPart2");
params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams");
params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
prevStepSize = -1.0;
}
......@@ -4071,9 +4194,10 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, integration.getStepSize().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel2, 1);
kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
}
double temperature = integrator.getTemperature();
double friction = integrator.getFriction();
......@@ -4086,6 +4210,16 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
double vscale = exp(-stepSize/tau);
double fscale = (1-vscale)*tau;
double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
vector<cl_double> p(params->getSize());
p[0] = vscale;
p[1] = fscale;
p[2] = noisescale;
params->upload(p);
mm_double2 ss = mm_double2(0, stepSize);
integration.getStepSize().upload(&ss);
}
else {
vector<cl_float> p(params->getSize());
p[0] = (cl_float) vscale;
p[1] = (cl_float) fscale;
......@@ -4093,6 +4227,7 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
params->upload(p);
mm_float2 ss = mm_float2(0, (float) stepSize);
integration.getStepSize().upload(&ss);
}
prevTemp = temperature;
prevFriction = friction;
prevStepSize = stepSize;
......@@ -4148,17 +4283,25 @@ void OpenCLIntegrateBrownianStepKernel::execute(ContextImpl& context, const Brow
kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, integration.getPosDelta().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel2, 2);
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
}
double temperature = integrator.getTemperature();
double friction = integrator.getFriction();
double stepSize = integrator.getStepSize();
if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
kernel1.setArg<cl_double>(0, tau*stepSize);
kernel1.setArg<cl_double>(1, sqrt(2.0f*BOLTZ*temperature*stepSize*tau));
kernel2.setArg<cl_double>(0, 1.0/stepSize);
}
else {
kernel1.setArg<cl_float>(0, (cl_float) (tau*stepSize));
kernel1.setArg<cl_float>(1, (cl_float) (sqrt(2.0f*BOLTZ*temperature*stepSize*tau)));
kernel2.setArg<cl_float>(0, (cl_float) (1.0/stepSize));
}
prevTemp = temperature;
prevFriction = friction;
prevStepSize = stepSize;
......@@ -4205,19 +4348,22 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
int numAtoms = cl.getNumAtoms();
bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
if (!hasInitializedKernels) {
hasInitializedKernels = true;
kernel1.setArg<cl_int>(0, numAtoms);
kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel1, 3);
kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(6, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl_int>(0, numAtoms);
kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel2, 3);
kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
selectSizeKernel.setArg<cl_int>(0, numAtoms);
selectSizeKernel.setArg<cl::Buffer>(3, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
......@@ -4227,9 +4373,16 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
// Select the step size to use.
float maxStepSize = (float)(maxTime-cl.getTime());
selectSizeKernel.setArg<cl_float>(1, maxStepSize);
double maxStepSize = maxTime-cl.getTime();
float maxStepSizeFloat = (float) maxStepSize;
if (useDouble) {
selectSizeKernel.setArg<cl_double>(1, maxStepSize);
selectSizeKernel.setArg<cl_double>(2, integrator.getErrorTolerance());
}
else {
selectSizeKernel.setArg<cl_float>(1, maxStepSizeFloat);
selectSizeKernel.setArg<cl_float>(2, (cl_float) integrator.getErrorTolerance());
}
cl.executeKernel(selectSizeKernel, blockSize, blockSize);
// Call the first integration kernel.
......@@ -4253,12 +4406,23 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
// Update the time and step count.
mm_float2 stepSize;
double dt, time;
if (useDouble) {
mm_double2 stepSize;
cl.getIntegrationUtilities().getStepSize().download(&stepSize);
double dt = stepSize.y;
double time = cl.getTime()+dt;
dt = stepSize.y;
time = cl.getTime()+dt;
if (dt == maxStepSize)
time = maxTime; // Avoid round-off error
}
else {
mm_float2 stepSize;
cl.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cl.getTime()+dt;
if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error
}
cl.setTime(time);
cl.setStepCount(cl.getStepCount()+1);
return dt;
......@@ -4279,7 +4443,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
kernel1 = cl::Kernel(program, "integrateLangevinPart1");
kernel2 = cl::Kernel(program, "integrateLangevinPart2");
selectSizeKernel = cl::Kernel(program, "selectLangevinStepSize");
params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams");
params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
blockSize = min(256, system.getNumParticles());
blockSize = max(blockSize, params->getSize());
blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
......@@ -4288,6 +4452,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
int numAtoms = cl.getNumAtoms();
bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
if (!hasInitializedKernels) {
hasInitializedKernels = true;
kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
......@@ -4297,9 +4462,10 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, integration.getStepSize().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel2, 1);
kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
......@@ -4310,11 +4476,20 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
// Select the step size to use.
float maxStepSize = (float)(maxTime-cl.getTime());
selectSizeKernel.setArg<cl_float>(0, maxStepSize);
double maxStepSize = maxTime-cl.getTime();
float maxStepSizeFloat = (float) maxStepSize;
if (useDouble) {
selectSizeKernel.setArg<cl_double>(0, maxStepSize);
selectSizeKernel.setArg<cl_double>(1, integrator.getErrorTolerance());
selectSizeKernel.setArg<cl_double>(2, integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction());
selectSizeKernel.setArg<cl_double>(3, BOLTZ*integrator.getTemperature());
}
else {
selectSizeKernel.setArg<cl_float>(0, maxStepSizeFloat);
selectSizeKernel.setArg<cl_float>(1, (cl_float) integrator.getErrorTolerance());
selectSizeKernel.setArg<cl_float>(2, (cl_float) (integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction()));
selectSizeKernel.setArg<cl_float>(3, (cl_float) (BOLTZ*integrator.getTemperature()));
}
cl.executeKernel(selectSizeKernel, blockSize, blockSize);
// Call the first integration kernel.
......@@ -4339,12 +4514,23 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
// Update the time and step count.
mm_float2 stepSize;
double dt, time;
if (useDouble) {
mm_double2 stepSize;
cl.getIntegrationUtilities().getStepSize().download(&stepSize);
double dt = stepSize.y;
double time = cl.getTime()+dt;
dt = stepSize.y;
time = cl.getTime()+dt;
if (dt == maxStepSize)
time = maxTime; // Avoid round-off error
}
else {
mm_float2 stepSize;
cl.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cl.getTime()+dt;
if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error
}
cl.setTime(time);
cl.setStepCount(cl.getStepCount()+1);
return dt;
......@@ -4352,8 +4538,8 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
class OpenCLIntegrateCustomStepKernel::ReorderListener : public OpenCLContext::ReorderListener {
public:
ReorderListener(OpenCLContext& cl, OpenCLParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValues, bool& deviceValuesAreCurrent) :
cl(cl), perDofValues(perDofValues), localPerDofValues(localPerDofValues), deviceValuesAreCurrent(deviceValuesAreCurrent) {
ReorderListener(OpenCLContext& cl, OpenCLParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValuesFloat, vector<vector<cl_double> >& localPerDofValuesDouble, bool& deviceValuesAreCurrent) :
cl(cl), perDofValues(perDofValues), localPerDofValuesFloat(localPerDofValuesFloat), localPerDofValuesDouble(localPerDofValuesDouble), deviceValuesAreCurrent(deviceValuesAreCurrent) {
int numAtoms = cl.getNumAtoms();
lastAtomOrder.resize(numAtoms);
for (int i = 0; i < numAtoms; i++)
......@@ -4365,21 +4551,39 @@ public:
if (perDofValues.getNumParameters() == 0)
return;
int numAtoms = cl.getNumAtoms();
const vector<int>& order = cl.getAtomIndex();
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
if (deviceValuesAreCurrent)
perDofValues.getParameterValues(localPerDofValuesDouble);
vector<vector<cl_double> > swap(3*numAtoms);
for (int i = 0; i < numAtoms; i++) {
swap[3*lastAtomOrder[i]] = localPerDofValuesDouble[3*i];
swap[3*lastAtomOrder[i]+1] = localPerDofValuesDouble[3*i+1];
swap[3*lastAtomOrder[i]+2] = localPerDofValuesDouble[3*i+2];
}
for (int i = 0; i < numAtoms; i++) {
localPerDofValuesDouble[3*i] = swap[3*order[i]];
localPerDofValuesDouble[3*i+1] = swap[3*order[i]+1];
localPerDofValuesDouble[3*i+2] = swap[3*order[i]+2];
}
perDofValues.setParameterValues(localPerDofValuesDouble);
}
else {
if (deviceValuesAreCurrent)
perDofValues.getParameterValues(localPerDofValues);
perDofValues.getParameterValues(localPerDofValuesFloat);
vector<vector<cl_float> > swap(3*numAtoms);
for (int i = 0; i < numAtoms; i++) {
swap[3*lastAtomOrder[i]] = localPerDofValues[3*i];
swap[3*lastAtomOrder[i]+1] = localPerDofValues[3*i+1];
swap[3*lastAtomOrder[i]+2] = localPerDofValues[3*i+2];
swap[3*lastAtomOrder[i]] = localPerDofValuesFloat[3*i];
swap[3*lastAtomOrder[i]+1] = localPerDofValuesFloat[3*i+1];
swap[3*lastAtomOrder[i]+2] = localPerDofValuesFloat[3*i+2];
}
const vector<cl_int>& order = cl.getAtomIndex();
for (int i = 0; i < numAtoms; i++) {
localPerDofValues[3*i] = swap[3*order[i]];
localPerDofValues[3*i+1] = swap[3*order[i]+1];
localPerDofValues[3*i+2] = swap[3*order[i]+2];
localPerDofValuesFloat[3*i] = swap[3*order[i]];
localPerDofValuesFloat[3*i+1] = swap[3*order[i]+1];
localPerDofValuesFloat[3*i+2] = swap[3*order[i]+2];
}
perDofValues.setParameterValues(localPerDofValuesFloat);
}
perDofValues.setParameterValues(localPerDofValues);
for (int i = 0; i < numAtoms; i++)
lastAtomOrder[i] = order[i];
deviceValuesAreCurrent = true;
......@@ -4387,7 +4591,8 @@ public:
private:
OpenCLContext& cl;
OpenCLParameterSet& perDofValues;
vector<vector<cl_float> >& localPerDofValues;
vector<vector<cl_float> >& localPerDofValuesFloat;
vector<vector<cl_double> >& localPerDofValuesDouble;
bool& deviceValuesAreCurrent;
vector<int> lastAtomOrder;
};
......@@ -4413,11 +4618,12 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
cl.getPlatformData().initializeContexts(system);
cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
numGlobalVariables = integrator.getNumGlobalVariables();
globalValues = OpenCLArray::create<cl_float>(cl, max(1, numGlobalVariables), "globalVariables");
sumBuffer = OpenCLArray::create<cl_float>(cl, 3*system.getNumParticles(), "sumBuffer");
energy = OpenCLArray::create<cl_float>(cl, 1, "energy");
perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables");
cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValues, deviceValuesAreCurrent));
int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
globalValues = new OpenCLArray(cl, max(1, numGlobalVariables), elementSize, "globalVariables");
sumBuffer = new OpenCLArray(cl, 3*system.getNumParticles(), elementSize, "sumBuffer");
energy = new OpenCLArray(cl, 1, elementSize, "energy");
perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
prevStepSize = -1.0;
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
}
......@@ -4492,19 +4698,31 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
int numAtoms = cl.getNumAtoms();
int numSteps = integrator.getNumComputations();
bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
if (!hasInitializedKernels) {
hasInitializedKernels = true;
// Initialize various data structures.
const map<string, double>& params = context.getParameters();
if (useDouble) {
contextParameterValues = OpenCLArray::create<cl_double>(cl, max(1, (int) params.size()), "contextParameters");
contextValuesDouble.resize(contextParameterValues->getSize());
for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
contextValuesDouble[parameterNames.size()] = iter->second;
parameterNames.push_back(iter->first);
}
contextParameterValues->upload(contextValuesDouble);
}
else {
contextParameterValues = OpenCLArray::create<cl_float>(cl, max(1, (int) params.size()), "contextParameters");
contextValues.resize(contextParameterValues->getSize());
contextValuesFloat.resize(contextParameterValues->getSize());
for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
contextValues[parameterNames.size()] = (float) iter->second;
contextValuesFloat[parameterNames.size()] = (float) iter->second;
parameterNames.push_back(iter->first);
}
contextParameterValues->upload(contextValues);
contextParameterValues->upload(contextValuesFloat);
}
kernels.resize(integrator.getNumComputations());
requiredGaussian.resize(integrator.getNumComputations(), 0);
requiredUniform.resize(integrator.getNumComputations(), 0);
......@@ -4644,7 +4862,6 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
compute << buffer.getType()<<" perDofy"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+1];\n";
compute << buffer.getType()<<" perDofz"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+2];\n";
}
string convert = (cl.getSupportsDoublePrecision() ? "convert_float4(" : "(");
int numGaussian = 0, numUniform = 0;
for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
compute << "{\n";
......@@ -4653,15 +4870,15 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (variable[j] == "x") {
if (storePosAsDelta[j]) {
if (cl.getSupportsDoublePrecision())
compute << "posDelta[index] = convert_float4(position-convert_double4(posq[index]));\n";
compute << "posDelta[index] = convert_mixed4(convert_double4(position)-convert_double4(loadPos(posq, posqCorrection, index)));\n";
else
compute << "posDelta[index] = position-posq[index];\n";
}
else
compute << "posq[index] = " << convert << "position);\n";
compute << "storePos(posq, posqCorrection, index, position);\n";
}
else if (variable[j] == "v")
compute << "velm[index] = " << convert << "velocity);\n";
compute << "velm[index] = convert_mixed4(velocity);\n";
else {
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
......@@ -4694,6 +4911,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
requiredUniform[step] = numUniform;
int index = 0;
kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel, index++);
kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, cl.getVelm().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, cl.getForce().getDeviceBuffer());
......@@ -4711,7 +4929,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Create a second kernel for this step that sums the values.
program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
kernel = cl::Kernel(program, "computeSum");
kernel = cl::Kernel(program, useDouble ? "computeDoubleSum" : "computeFloatSum");
kernels[step].push_back(kernel);
index = 0;
kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
......@@ -4760,6 +4978,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels[step].push_back(kernel);
int index = 0;
kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel, index++);
kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
}
}
......@@ -4767,7 +4986,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Create the kernel for summing energy.
cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
sumEnergyKernel = cl::Kernel(program, "computeSum");
sumEnergyKernel = cl::Kernel(program, cl.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
int index = 0;
sumEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
sumEnergyKernel.setArg<cl::Buffer>(index++, energy->getDeviceBuffer());
......@@ -4778,26 +4997,48 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Make sure all values (variables, parameters, etc.) stored on the device are up to date.
if (!deviceValuesAreCurrent) {
perDofValues->setParameterValues(localPerDofValues);
if (useDouble)
perDofValues->setParameterValues(localPerDofValuesDouble);
else
perDofValues->setParameterValues(localPerDofValuesFloat);
deviceValuesAreCurrent = true;
}
localValuesAreCurrent = false;
double stepSize = integrator.getStepSize();
if (stepSize != prevStepSize) {
if (useDouble) {
mm_double2 ss = mm_double2(0, stepSize);
integration.getStepSize().upload(&ss);
}
else {
mm_float2 ss = mm_float2(0, (float) stepSize);
integration.getStepSize().upload(&ss);
}
prevStepSize = stepSize;
}
bool paramsChanged = false;
if (useDouble) {
for (int i = 0; i < (int) parameterNames.size(); i++) {
double value = context.getParameter(parameterNames[i]);
if (value != contextValuesDouble[i]) {
contextValuesDouble[i] = value;
paramsChanged = true;
}
}
if (paramsChanged)
contextParameterValues->upload(contextValuesDouble);
}
else {
for (int i = 0; i < (int) parameterNames.size(); i++) {
float value = (float) context.getParameter(parameterNames[i]);
if (value != contextValues[i]) {
contextValues[i] = value;
if (value != contextValuesFloat[i]) {
contextValuesFloat[i] = value;
paramsChanged = true;
}
}
if (paramsChanged)
contextParameterValues->upload(contextValues);
contextParameterValues->upload(contextValuesFloat);
}
// Loop over computation steps in the integrator and execute them.
......@@ -4826,7 +5067,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
forcesAreValid = true;
}
if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i]));
kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
if (requiredUniform[i] > 0)
cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[i][0], numAtoms);
......@@ -4837,7 +5078,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl.executeKernel(kernels[i][0], 1, 1);
}
else if (stepType[i] == CustomIntegrator::ComputeSum) {
kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i]));
kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
if (requiredUniform[i] > 0)
cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[i][0], numAtoms);
......@@ -4875,11 +5116,21 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
/**
 * Copy context parameter values that were changed on the device back into the context.
 *
 * Does nothing unless modifiesParameters is set. Otherwise the contents of
 * contextParameterValues are downloaded and each entry is compared with the value the
 * context currently reports for the parameter of the same index in parameterNames;
 * every entry that differs is written back with setParameter().
 *
 * @param context    the context whose parameters are compared and, if needed, updated
 */
void OpenCLIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
    if (!modifiesParameters)
        return;
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        // Double and mixed precision: the device values are read back as doubles.
        contextParameterValues->download(contextValuesDouble);
        for (int i = 0; i < (int) parameterNames.size(); i++) {
            double value = context.getParameter(parameterNames[i]);
            if (value != contextValuesDouble[i])
                context.setParameter(parameterNames[i], contextValuesDouble[i]);
        }
    }
    else {
        // Single precision: the device values are read back as floats, and the context's
        // value is narrowed to float before comparing so both sides have the same precision.
        contextParameterValues->download(contextValuesFloat);
        for (int i = 0; i < (int) parameterNames.size(); i++) {
            float value = (float) context.getParameter(parameterNames[i]);
            if (value != contextValuesFloat[i])
                context.setParameter(parameterNames[i], contextValuesFloat[i]);
        }
    }
}
......@@ -4888,43 +5139,72 @@ void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, v
values.resize(0);
return;
}
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
globalValues->download(values);
else {
vector<cl_float> buffer;
globalValues->download(buffer);
values.resize(numGlobalVariables);
for (int i = 0; i < numGlobalVariables; i++)
values[i] = buffer[i];
}
}
/**
 * Upload new values for the integrator's global variables to the device.
 *
 * Does nothing when numGlobalVariables is zero.
 *
 * @param context    not used by this method
 * @param values     the new values; in single precision the first numGlobalVariables
 *                   entries are read and converted to cl_float before uploading
 */
void OpenCLIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
    if (numGlobalVariables == 0)
        return;
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
        globalValues->upload(values); // Uploaded as doubles, with no conversion.
    else {
        // Single precision: narrow each value to cl_float before uploading.
        vector<cl_float> buffer(numGlobalVariables);
        for (int i = 0; i < numGlobalVariables; i++)
            buffer[i] = (cl_float) values[i];
        globalValues->upload(buffer);
    }
}
/**
 * Retrieve the values of one per-DOF variable.
 *
 * If the host-side cache of per-DOF values is not current (localValuesAreCurrent is false)
 * it is first refreshed from perDofValues. Row 3*i+j of the cache is then copied to
 * component j of values[order[i]], where order is cl.getAtomIndex().
 *
 * @param context    not used by this method
 * @param variable   the index of the variable (column of the cache) to read
 * @param values     on exit, resized to perDofValues->getNumObjects()/3 and filled with
 *                   the variable's values
 */
void OpenCLIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
    values.resize(perDofValues->getNumObjects()/3);
    const vector<int>& order = cl.getAtomIndex();
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        // Double and mixed precision use the double-typed cache.
        if (!localValuesAreCurrent) {
            perDofValues->getParameterValues(localPerDofValuesDouble);
            localValuesAreCurrent = true;
        }
        for (int i = 0; i < (int) values.size(); i++)
            for (int j = 0; j < 3; j++)
                values[order[i]][j] = localPerDofValuesDouble[3*i+j][variable];
    }
    else {
        // Single precision uses the float-typed cache.
        if (!localValuesAreCurrent) {
            perDofValues->getParameterValues(localPerDofValuesFloat);
            localValuesAreCurrent = true;
        }
        for (int i = 0; i < (int) values.size(); i++)
            for (int j = 0; j < 3; j++)
                values[order[i]][j] = localPerDofValuesFloat[3*i+j][variable];
    }
}
/**
 * Set the values of one per-DOF variable.
 *
 * Only the host-side cache is modified. If the cache is not current it is first refreshed
 * from perDofValues, then column `variable` is overwritten: component j of values[order[i]]
 * is stored in row 3*i+j, where order is cl.getAtomIndex(). deviceValuesAreCurrent is
 * cleared afterwards.
 *
 * @param context    not used by this method
 * @param variable   the index of the variable (column of the cache) to write
 * @param values     the new values of the variable
 */
void OpenCLIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
    const vector<int>& order = cl.getAtomIndex();
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        // Double and mixed precision use the double-typed cache.
        if (!localValuesAreCurrent) {
            perDofValues->getParameterValues(localPerDofValuesDouble);
            localValuesAreCurrent = true;
        }
        for (int i = 0; i < (int) values.size(); i++)
            for (int j = 0; j < 3; j++)
                localPerDofValuesDouble[3*i+j][variable] = values[order[i]][j];
    }
    else {
        // Single precision uses the float-typed cache; values are narrowed to float.
        if (!localValuesAreCurrent) {
            perDofValues->getParameterValues(localPerDofValuesFloat);
            localValuesAreCurrent = true;
        }
        for (int i = 0; i < (int) values.size(); i++)
            for (int j = 0; j < 3; j++)
                localPerDofValuesFloat[3*i+j][variable] = (float) values[order[i]][j];
    }
    deviceValuesAreCurrent = false;
}
......@@ -5035,14 +5315,24 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
// We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
// on the CPU.
const vector<cl_int>& order = cl.getAtomIndex();
double energy = 0.0;
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
for (size_t i = 0; i < masses.size(); ++i) {
mm_double4 v = velm[i];
energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
}
}
else {
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
double energy = 0.0;
const vector<cl_int>& order = cl.getAtomIndex();
for (size_t i = 0; i < masses.size(); ++i) {
mm_float4 v = velm[i];
energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
}
}
return 0.5*energy;
}
......
......@@ -1145,7 +1145,10 @@ private:
OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed;
OpenCLParameterSet* perDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
std::vector<float> contextValuesFloat;
std::vector<double> contextValuesDouble;
std::vector<float> contextValues;
std::vector<std::vector<cl::Kernel> > kernels;
cl::Kernel sumEnergyKernel, randomKernel;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,32 +32,34 @@
using namespace OpenMM;
using namespace std;
OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) :
OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
int params = numParameters;
int bufferCount = 0;
elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
string elementType = (useDoublePrecision ? "double" : "float");
try {
if (!bufferPerParameter) {
while (params > 2) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4));
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*4);
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, *buf));
params -= 4;
}
if (params > 1) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2));
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*2);
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, *buf));
params -= 2;
}
}
while (params > 0) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float));
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize);
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, *buf));
params--;
}
}
......@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() {
delete &buffers[i].getMemory();
}
void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) const {
template <class T>
void OpenCLParameterSet::getParameterValues(vector<vector<T> >& values) const {
if (sizeof(T) != elementSize)
throw OpenMMException("Called getParameterValues() with vector of wrong type");
values.resize(numObjects);
for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters);
try {
int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") {
vector<mm_float4> data(numObjects);
if (buffers[i].getSize() == 4*elementSize) {
vector<T> data(4*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x;
values[j][base] = data[4*j];
if (base+1 < numParameters)
values[j][base+1] = data[j].y;
values[j][base+1] = data[4*j+1];
if (base+2 < numParameters)
values[j][base+2] = data[j].z;
values[j][base+2] = data[4*j+2];
if (base+3 < numParameters)
values[j][base+3] = data[j].w;
values[j][base+3] = data[4*j+3];
}
base += 4;
}
else if (buffers[i].getType() == "float2") {
vector<mm_float2> data(numObjects);
else if (buffers[i].getSize() == 2*elementSize) {
vector<T> data(2*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x;
values[j][base] = data[2*j];
if (base+1 < numParameters)
values[j][base+1] = data[j].y;
values[j][base+1] = data[2*j+1];
}
base += 2;
}
else if (buffers[i].getType() == "float") {
vector<cl_float> data(numObjects);
else if (buffers[i].getSize() == elementSize) {
vector<T> data(numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++)
values[j][base] = data[j];
......@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c
}
}
void OpenCLParameterSet::setParameterValues(const vector<vector<cl_float> >& values) {
template <class T>
void OpenCLParameterSet::setParameterValues(const vector<vector<T> >& values) {
if (sizeof(T) != elementSize)
throw OpenMMException("Called setParameterValues() with vector of wrong type");
try {
int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") {
vector<mm_float4> data(numObjects);
if (buffers[i].getSize() == 4*elementSize) {
vector<T> data(4*numObjects);
for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base];
data[4*j] = values[j][base];
if (base+1 < numParameters)
data[j].y = values[j][base+1];
data[4*j+1] = values[j][base+1];
if (base+2 < numParameters)
data[j].z = values[j][base+2];
data[4*j+2] = values[j][base+2];
if (base+3 < numParameters)
data[j].w = values[j][base+3];
data[4*j+3] = values[j][base+3];
}
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 4;
}
else if (buffers[i].getType() == "float2") {
vector<mm_float2> data(numObjects);
else if (buffers[i].getSize() == 2*elementSize) {
vector<T> data(2*numObjects);
for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base];
data[2*j] = values[j][base];
if (base+1 < numParameters)
data[j].y = values[j][base+1];
data[2*j+1] = values[j][base+1];
}
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 2;
}
else if (buffers[i].getType() == "float") {
vector<cl_float> data(numObjects);
else if (buffers[i].getSize() == elementSize) {
vector<T> data(numObjects);
for (int j = 0; j < numObjects; j++)
data[j] = values[j][base];
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
......@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr
const string suffixes[] = {".x", ".y", ".z", ".w"};
int buffer = -1;
for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
if (index*sizeof(cl_float) < buffers[i].getSize())
if (index*elementSize < buffers[i].getSize())
buffer = i;
else
index -= buffers[i].getSize()/sizeof(cl_float);
index -= buffers[i].getSize()/elementSize;
}
if (buffer == -1)
throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")");
stringstream suffix;
suffix << (buffer+1) << extraSuffix;
if (buffers[buffer].getType() != "float")
if (buffers[buffer].getSize() != elementSize)
suffix << suffixes[index];
return suffix.str();
}
/**
 * Explicitly instantiate the templated getParameterValues() and setParameterValues()
 * for the two element types, float and double.
 */
namespace OpenMM {
template void OpenCLParameterSet::getParameterValues<float>(vector<vector<float> >&) const;
template void OpenCLParameterSet::getParameterValues<double>(vector<vector<double> >&) const;
template void OpenCLParameterSet::setParameterValues<float>(const vector<vector<float> >&);
template void OpenCLParameterSet::setParameterValues<double>(const vector<vector<double> >&);
}
\ No newline at end of file
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -51,8 +51,9 @@ public:
* @param name the name of the parameter set
* @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false,
* multiple parameters may be combined into a single buffer.
* @param useDoublePrecision if true, values are stored in double precision; if false, in single precision
*/
OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false);
OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
~OpenCLParameterSet();
/**
* Get the number of parameters.
......@@ -71,13 +72,15 @@ public:
*
* @param values on exit, values[i][j] contains the value of parameter j for object i
*/
void getParameterValues(std::vector<std::vector<cl_float> >& values) const;
template <class T>
void getParameterValues(std::vector<std::vector<T> >& values) const;
/**
* Set the values of all parameters.
*
* @param values values[i][j] contains the value of parameter j for object i
*/
void setParameterValues(const std::vector<std::vector<cl_float> >& values);
template <class T>
void setParameterValues(const std::vector<std::vector<T> >& values);
/**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
......@@ -95,8 +98,7 @@ public:
std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
private:
OpenCLContext& context;
int numParameters;
int numObjects;
int numParameters, numObjects, elementSize;
std::string name;
std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers;
};
......
......@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() {
registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(OpenCLDeviceIndex());
platformProperties.push_back(OpenCLPlatformIndex());
platformProperties.push_back(OpenCLPrecision());
setPropertyDefaultValue(OpenCLDeviceIndex(), "");
setPropertyDefaultValue(OpenCLPlatformIndex(), "");
setPropertyDefaultValue(OpenCLPrecision(), "single");
}
bool OpenCLPlatform::supportsDoublePrecision() const {
......@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second);
const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue));
string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
}
void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
......@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
delete data;
}
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
int platformIndex = 0;
if (platformPropValue.length() > 0)
stringstream(platformPropValue) >> platformIndex;
......@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
if (devices[i].length() > 0) {
unsigned int deviceIndex;
stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, *this));
contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, precisionProperty, *this));
}
}
if (contexts.size() == 0)
contexts.push_back(new OpenCLContext(system, platformIndex, -1, *this));
contexts.push_back(new OpenCLContext(system, platformIndex, -1, precisionProperty, *this));
stringstream device;
for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0)
......@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
}
propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
contextEnergy.resize(contexts.size());
}
......
......@@ -2,17 +2,19 @@
* Apply the Andersen thermostat to adjust particle velocities.
*/
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global mixed4* velm, __global const mixed2* restrict stepSize, __global const float4* restrict random,
        unsigned int randomIndex, __global const int* restrict atomGroups) {
    // The collision probability is computed from collisionFrequency and stepSize[0].y, and
    // randomRange is the half-width of the interval the selection value is tested against.
    float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
    float randomRange = erf(collisionProbability/sqrt(2.0f));
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
        mixed4 velocity = velm[index];
        // The selection value is looked up by atomGroups[index], so atoms with the same
        // group index read the same value; velRand is looked up per atom.
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];
        // scale is 0 when selectRand.w lies strictly inside (-randomRange, randomRange), in which
        // case the velocity is replaced by add*velRand; otherwise scale is 1 and it is kept.
        real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1);
        real add = (1-scale)*sqrt(kT*velocity.w);
        // Updated one component at a time; velocity.w is left untouched.
        velocity.x = scale*velocity.x + add*velRand.x;
        velocity.y = scale*velocity.y + add*velRand.y;
        velocity.z = scale*velocity.z + add*velRand.z;
        velm[index] = velocity;
    }
}
......@@ -2,13 +2,16 @@
* Perform the first step of Brownian integration.
*/
__kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __global const real4* restrict force,
        __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
    // randomIndex advances in step with index, so each atom reads its own entry of random.
    randomIndex += get_global_id(0);
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
        mixed invMass = velm[index].w;
        // Atoms whose velm.w is zero are skipped: their posDelta entry is not written.
        if (invMass != 0) {
            // Each component is a force term plus a random term; the w component is set to 0.
            posDelta[index] = (mixed4) (tauDeltaT*invMass*force[index].x + noiseAmplitude*sqrt(invMass)*random[randomIndex].x,
                                        tauDeltaT*invMass*force[index].y + noiseAmplitude*sqrt(invMass)*random[randomIndex].y,
                                        tauDeltaT*invMass*force[index].z + noiseAmplitude*sqrt(invMass)*random[randomIndex].z, 0);
        }
        randomIndex += get_global_size(0);
    }
}
......@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
* Perform the second step of Brownian integration.
*/
__kernel void integrateBrownianPart2(mixed oneOverDeltaT, __global real4* posq, __global real4* posqCorrection, __global mixed4* velm, __global const mixed4* restrict posDelta) {
    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
        // Atoms whose velm.w is zero are left unchanged.
        if (velm[index].w != 0) {
            mixed4 delta = posDelta[index];
            // The new velocity is the displacement scaled by oneOverDeltaT; velm.w is preserved.
            velm[index].x = oneOverDeltaT*delta.x;
            velm[index].y = oneOverDeltaT*delta.y;
            velm[index].z = oneOverDeltaT*delta.z;
#ifdef USE_MIXED_PRECISION
            // Mixed precision: rebuild the position as posq + posqCorrection in `mixed`.
            real4 pos1 = posq[index];
            real4 pos2 = posqCorrection[index];
            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
            real4 pos = posq[index];
#endif
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
#ifdef USE_MIXED_PRECISION
            // Store the value rounded to `real` in posq and the rounding remainder in posqCorrection.
            posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
            posq[index] = pos;
#endif
        }
    }
}
/**
 * Load the position stored at a given index.
 *
 * When USE_MIXED_PRECISION is defined, the x, y and z components are the sum of the
 * matching components of posq and posqCorrection, evaluated in `mixed`, and w is taken
 * from posq alone. Otherwise posq[index] is returned as is and posqCorrection is not read.
 */
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifndef USE_MIXED_PRECISION
    return posq[index];
#else
    const real4 base = posq[index];
    const real4 correction = posqCorrection[index];
    mixed4 result;
    result.x = base.x+(mixed) correction.x;
    result.y = base.y+(mixed) correction.y;
    result.z = base.z+(mixed) correction.z;
    result.w = base.w;
    return result;
#endif
}
/**
* Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global float4* restrict constraintDistance, __global const float4* restrict atomPositions) {
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance, __global const real4* restrict atomPositions, __global const real4* restrict posCorrection) {
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the direction for this constraint.
int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index];
float4 oldPos1 = atomPositions[atoms.x];
float4 oldPos2 = atomPositions[atoms.y];
mixed4 dir = constraintDistance[index];
mixed4 oldPos1 = loadPos(atomPositions, posCorrection, atoms.x);
mixed4 oldPos2 = loadPos(atomPositions, posCorrection, atoms.y);
dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z;
......@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
/**
* Compute the force applied by each constraint.
*/
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const float4* restrict constraintDistance, __global const float4* restrict atomPositions,
__global const float* restrict reducedMass, __global float* restrict delta1, __global int* restrict converged, float tol, int iteration) {
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
__global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
__local int groupConverged;
if (converged[1-iteration%2]) {
if (get_global_id(0) == 0)
......@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
if (get_local_id(0) == 0)
groupConverged = 1;
barrier(CLK_LOCAL_MEM_FENCE);
float lowerTol = 1.0f-2.0f*tol+tol*tol;
float upperTol = 1.0f+2.0f*tol+tol*tol;
mixed lowerTol = 1-2*tol+tol*tol;
mixed upperTol = 1+2*tol+tol*tol;
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the force due to this constraint.
int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index];
float4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
mixed4 dir = constraintDistance[index];
mixed4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
#ifndef CONSTRAIN_VELOCITIES
rp_ij.xyz += dir.xyz;
#endif
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
#ifdef CONSTRAIN_VELOCITIES
delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2;
delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;
// See whether it has converged.
......@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
converged[iteration%2] = 0;
}
#else
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w;
float diff = dist2 - rp2;
mixed rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
mixed dist2 = dir.w*dir.w;
mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
// See whether it has converged.
......@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
/**
* Multiply the vector of constraint forces by the constraint matrix.
*/
__kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, __global float* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const float* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
__kernel void multiplyByConstraintMatrix(__global const mixed* restrict delta1, __global mixed* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const mixed* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
if (converged[iteration%2])
return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix.
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
float sum = 0.0f;
mixed sum = 0;
for (int i = 0; ; i++) {
int element = index+i*NUM_CONSTRAINTS;
int column = constraintMatrixColumn[element];
......@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1,
/**
* Update the atom positions based on constraint forces.
*/
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const float4* restrict constraintDistance,
__global float4* restrict atomPositions, __global const float4* restrict velm, __global const float* restrict delta1, __global const float* restrict delta2, __global int* restrict converged, int iteration) {
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const mixed4* restrict constraintDistance,
__global mixed4* restrict atomPositions, __global const mixed4* restrict velm, __global const mixed* restrict delta1, __global const mixed* restrict delta2, __global int* restrict converged, int iteration) {
if (get_global_id(0) == 0)
converged[1-iteration%2] = 1;
if (converged[iteration%2])
return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f);
mixed damping = (iteration < 2 ? 0.5f : 1.0f);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
// Compute the new position of this atom.
float4 atomPos = atomPositions[index];
float invMass = velm[index].w;
mixed4 atomPos = atomPositions[index];
mixed invMass = velm[index].w;
int num = numAtomConstraints[index];
for (int i = 0; i < num; i++) {
int constraint = atomConstraints[index+i*NUM_ATOMS];
bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*delta2[constraint];
mixed constraintForce = damping*invMass*delta2[constraint];
constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = constraintDistance[constraint];
mixed4 dir = constraintDistance[constraint];
atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z;
......
/**
 * Add the displacement stored in posDelta to every atom position.
 *
 * Each work-item starts at its global id and advances by the global size until
 * NUM_ATOMS is reached.
 *
 * NOTE(review): this span appears to come from a rendered diff. The float4 signature and
 * the three float4 "position" statements look like the superseded version printed next to
 * the real4/mixed4 replacement; confirm against the actual kernel file.
 */
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) {
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index];
position.xyz += posDelta[index].xyz;
posq[index] = position;
#ifdef USE_MIXED_PRECISION
// Rebuild x, y, z as posq + posqCorrection, widening the correction to mixed before the add.
// w is taken from posq only.
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
mixed4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
// Store the value rounded to real in posq, and the part lost by that rounding in
// posqCorrection (whose w component is written as 0).
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
}
__kernel void computeSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
__local float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
float sum = 0.0f;
float sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
......@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa
result[outputIndex] = tempBuffer[0];
}
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) {
#ifdef SUPPORTS_DOUBLE_PRECISION
/**
 * Add up the first bufferSize entries of sumBuffer and store the total in result[outputIndex].
 *
 * Every work-item first accumulates the entries at its local id, local id + local size,
 * and so on. The per-item totals are then combined pairwise in __local memory, and the
 * work-item with local id 0 writes the final value.
 *
 * NOTE(review): the pairwise combine spans WORK_GROUP_SIZE slots, so it presumably expects
 * the kernel to be launched with that work-group size - confirm against the host code.
 */
__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, unsigned int outputIndex, int bufferSize) {
    __local double partialSums[WORK_GROUP_SIZE];
    const unsigned int localId = get_local_id(0);

    // Strided pass over the input: one private running total per work-item.
    double runningTotal = 0;
    unsigned int element = localId;
    while (element < bufferSize) {
        runningTotal += sumBuffer[element];
        element += get_local_size(0);
    }
    partialSums[localId] = runningTotal;

    // Pairwise combine: at each stride, slots that are multiples of 2*stride absorb
    // the slot one stride further along, provided that slot exists.
    int stride = 1;
    while (stride < WORK_GROUP_SIZE) {
        barrier(CLK_LOCAL_MEM_FENCE);
        const bool absorbsPartner = (localId%(stride*2) == 0);
        const bool partnerExists = (localId+stride < WORK_GROUP_SIZE);
        if (absorbsPartner && partnerExists)
            partialSums[localId] += partialSums[localId+stride];
        stride *= 2;
    }

    if (localId == 0)
        result[outputIndex] = partialSums[0];
}
#endif
/**
 * Add the displacement stored in posDelta to every atom position, then reset posDelta to zero.
 *
 * Each work-item starts at its global id and advances by the global size until
 * NUM_ATOMS is reached.
 *
 * NOTE(review): this span appears to come from a rendered diff. The four float4 statements
 * at the top of the loop look like the superseded body printed next to its real4/mixed4
 * replacement; confirm against the actual kernel file.
 */
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index];
position.xyz += posDelta[index].xyz;
posq[index] = position;
posDelta[index] = (float4) 0.0f;
#ifdef USE_MIXED_PRECISION
// Rebuild x, y, z as posq + posqCorrection, widening the correction to mixed before the add.
// w is taken from posq only.
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
// Store the value rounded to real in posq, and the part lost by that rounding in
// posqCorrection (whose w component is written as 0).
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
// Clear the displacement that has just been applied.
posDelta[index] = (mixed4) 0;
}
}
......
/**
 * Kernel whose entire body is the COMPUTE_STEP macro; the macro's expansion is defined
 * outside this view, so what is read or written through the arguments cannot be
 * stated from here.
 *
 * NOTE(review): two signatures are shown back to back. The float2/float one appears to be
 * the superseded version from the rendered diff, and the mixed2/mixed/real one its replacement.
 */
__kernel void computeGlobal(__global float2* restrict dt, __global float* restrict globals, __global float* restrict params,
float uniform, float gaussian, __global const float* restrict energy) {
__kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
float uniform, float gaussian, __global const real* restrict energy) {
COMPUTE_STEP
}
#ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/**
 * Read the position of particle `index` as a mixed4.
 *
 * When USE_MIXED_PRECISION is defined, x, y and z are each formed by adding the posq
 * component to the matching posqCorrection component (the correction is widened to
 * mixed before the add), and w is copied from posq alone. Otherwise posq[index] is
 * returned as is.
 */
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
    const real4 stored = posq[index];
    const real4 correction = posqCorrection[index];
    mixed4 combined;
    combined.x = stored.x+(mixed)correction.x;
    combined.y = stored.y+(mixed)correction.y;
    combined.z = stored.z+(mixed)correction.z;
    combined.w = stored.w;
    return combined;
#else
    return posq[index];
#endif
}
/**
 * Write the position of particle `index` back to memory.
 *
 * When USE_MIXED_PRECISION is defined, posq receives every component of pos rounded to
 * real, and posqCorrection receives, for x, y and z, the difference between the mixed
 * value and its rounded form; the w component of the correction is written as 0.
 * Otherwise pos is stored directly in posq and posqCorrection is left untouched.
 */
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
    real4 rounded;
    rounded.x = (real) pos.x;
    rounded.y = (real) pos.y;
    rounded.z = (real) pos.z;
    rounded.w = (real) pos.w;

    // Whatever the rounding above discarded is kept separately.
    real4 residual;
    residual.x = pos.x-(real) pos.x;
    residual.y = pos.y-(real) pos.y;
    residual.z = pos.z-(real) pos.z;
    residual.w = 0;

    posq[index] = rounded;
    posqCorrection[index] = residual;
#else
    posq[index] = pos;
#endif
}
__kernel void computePerDof(__global float4* restrict posq, __global float4* restrict posDelta, __global float4* restrict velm,
__global const float4* restrict force, __global const float2* restrict dt, __global const float* restrict globals,
__global const float* restrict params, __global float* restrict sum, __global const float4* restrict gaussianValues,
unsigned int randomIndex, __global const float4* restrict uniformValues, __global const float* restrict energy
__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
unsigned int randomIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
PARAMETER_ARGUMENTS) {
float stepSize = dt[0].y;
mixed stepSize = dt[0].y;
int index = get_global_id(0);
randomIndex += index;
while (index < NUM_ATOMS) {
#ifdef SUPPORTS_DOUBLE_PRECISION
#ifdef LOAD_POS_AS_DELTA
double4 position = convert_double4(posq[index]+posDelta[index]);
mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
#else
double4 position = convert_double4(posq[index]);
#endif
double4 velocity = convert_double4(velm[index]);
double4 f = convert_double4(force[index]);
double mass = 1.0/velocity.w;
#else
#ifdef LOAD_POS_AS_DELTA
float4 position = posq[index]+posDelta[index];
#else
float4 position = posq[index];
#endif
float4 velocity = velm[index];
float4 f = force[index];
float mass = 1.0f/velocity.w;
mixed4 position = loadPos(posq, posqCorrection, index);
#endif
mixed4 velocity = velm[index];
real4 f = force[index];
mixed mass = 1/velocity.w;
if (velocity.w != 0.0) {
float4 gaussian = gaussianValues[randomIndex];
float4 uniform = uniformValues[index];
......
/**
 * Multiply two complex numbers held in float2 values, with x as the real part and
 * y as the imaginary part: (a.x + i*a.y) * (b.x + i*b.y).
 */
float2 multofFloat2(float2 a, float2 b) {
    const float realPart = a.x*b.x - a.y*b.y;
    const float imagPart = a.x*b.y + a.y*b.x;
    return (float2) (realPart, imagPart);
}
......
#ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
enum {VelScale, ForceScale, NoiseScale, MaxParams};
/**
* Perform the first step of Langevin integration.
*/
__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta,
__global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
float vscale = paramBuffer[VelScale];
float fscale = paramBuffer[ForceScale];
float noisescale = paramBuffer[NoiseScale];
float stepSize = dt[0].y;
__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
__global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale];
mixed noisescale = paramBuffer[NoiseScale];
mixed stepSize = dt[0].y;
int index = get_global_id(0);
randomIndex += index;
while (index < NUM_ATOMS) {
float4 velocity = velm[index];
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
float sqrtInvMass = sqrt(velocity.w);
velocity.xyz = vscale*velocity.xyz + fscale*velocity.w*force[index].xyz + noisescale*sqrtInvMass*random[randomIndex].xyz;
mixed sqrtInvMass = sqrt(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity;
posDelta[index] = stepSize*velocity;
}
......@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
* Perform the second step of Langevin integration.
*/
__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
#ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y;
#else
......@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
#endif
int index = get_global_id(0);
while (index < NUM_ATOMS) {
float4 vel = velm[index];
mixed4 vel = velm[index];
if (vel.w != 0.0) {
float4 pos = posq[index];
float4 delta = posDelta[index];
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz;
vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else
vel.xyz = invStepSize*delta.xyz;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
velm[index] = vel;
}
index += get_global_size(0);
......@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
* Select the step size to use for the next step.
*/
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt,
__global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) {
__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, __global mixed2* restrict dt,
__global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
// Calculate the error.
float err = 0.0f;
mixed err = 0.0f;
unsigned int index = get_local_id(0);
while (index < NUM_ATOMS) {
float4 f = force[index];
float invMass = velm[index].w;
real4 f = force[index];
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += get_global_size(0);
}
......@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
if (get_global_id(0) == 0) {
// Select the new step size.
float totalError = sqrt(error[0]/(NUM_ATOMS*3));
float newStepSize = sqrt(errorTol/totalError);
float oldStepSize = dt[0].y;
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
......@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
// Recalculate the integration parameters.
float vscale = exp(-newStepSize/tau);
float fscale = (1-vscale)*tau;
float noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
mixed vscale = exp(-newStepSize/tau);
mixed fscale = (1-vscale)*tau;
mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale;
params[ForceScale] = fscale;
params[NoiseScale] = noisescale;
......
......@@ -2,13 +2,16 @@
* Calculate the center of mass momentum.
*/
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
int index = get_global_id(0);
float4 cm = 0.0f;
while (index < numAtoms) {
float4 velocity = velm[index];
if (velocity.w != 0.0)
cm.xyz += velocity.xyz/velocity.w;
mixed4 velocity = velm[index];
if (velocity.w != 0) {
cm.x += velocity.x/velocity.w;
cm.y += velocity.y/velocity.w;
cm.z += velocity.z/velocity.w;
}
index += get_global_size(0);
}
......@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
* Remove center of mass motion.
*/
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups.
unsigned int index = get_local_id(0);
......@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4*
index = get_global_id(0);
while (index < numAtoms) {
velm[index].xyz -= cm.xyz;
velm[index].x -= cm.x;
velm[index].y -= cm.y;
velm[index].z -= cm.z;
index += get_global_size(0);
}
}
/**
 * Read the position of particle `index` as a mixed4.
 *
 * When USE_MIXED_PRECISION is defined, x, y and z are each formed by adding the posq
 * component to the matching posqCorrection component (the correction is widened to
 * mixed before the add), and w is copied from posq alone. Otherwise posq[index] is
 * returned as is.
 */
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
    const real4 stored = posq[index];
    const real4 correction = posqCorrection[index];
    mixed4 combined;
    combined.x = stored.x+(mixed)correction.x;
    combined.y = stored.y+(mixed)correction.y;
    combined.z = stored.z+(mixed)correction.z;
    combined.w = stored.w;
    return combined;
#else
    return posq[index];
#endif
}
/**
* Enforce constraints on SETTLE clusters
*/
__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
__kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0);
while (index < numClusters) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
float2 params = clusterParams[index];
float4 apos0 = oldPos[atoms.x];
float4 xp0 = posDelta[atoms.x];
float4 apos1 = oldPos[atoms.y];
float4 xp1 = posDelta[atoms.y];
float4 apos2 = oldPos[atoms.z];
float4 xp2 = posDelta[atoms.z];
float m0 = RECIP(velm[atoms.x].w);
float m1 = RECIP(velm[atoms.y].w);
float m2 = RECIP(velm[atoms.z].w);
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
mixed4 xp0 = posDelta[atoms.x];
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 xp1 = posDelta[atoms.y];
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
mixed4 xp2 = posDelta[atoms.z];
mixed m0 = 1/velm[atoms.x].w;
mixed m1 = 1/velm[atoms.y].w;
mixed m2 = 1/velm[atoms.z].w;
// Apply the SETTLE algorithm.
float xb0 = apos1.x-apos0.x;
float yb0 = apos1.y-apos0.y;
float zb0 = apos1.z-apos0.z;
float xc0 = apos2.x-apos0.x;
float yc0 = apos2.y-apos0.y;
float zc0 = apos2.z-apos0.z;
float invTotalMass = 1.0f/(m0+m1+m2);
float xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
float ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
float zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
float xa1 = xp0.x - xcom;
float ya1 = xp0.y - ycom;
float za1 = xp0.z - zcom;
float xb1 = xb0 + xp1.x - xcom;
float yb1 = yb0 + xp1.y - ycom;
float zb1 = zb0 + xp1.z - zcom;
float xc1 = xc0 + xp2.x - xcom;
float yc1 = yc0 + xp2.y - ycom;
float zc1 = zc0 + xp2.z - zcom;
float xaksZd = yb0*zc0 - zb0*yc0;
float yaksZd = zb0*xc0 - xb0*zc0;
float zaksZd = xb0*yc0 - yb0*xc0;
float xaksXd = ya1*zaksZd - za1*yaksZd;
float yaksXd = za1*xaksZd - xa1*zaksZd;
float zaksXd = xa1*yaksZd - ya1*xaksZd;
float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
float axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
float aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
float azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
float trns11 = xaksXd / axlng;
float trns21 = yaksXd / axlng;
float trns31 = zaksXd / axlng;
float trns12 = xaksYd / aylng;
float trns22 = yaksYd / aylng;
float trns32 = zaksYd / aylng;
float trns13 = xaksZd / azlng;
float trns23 = yaksZd / azlng;
float trns33 = zaksZd / azlng;
float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
mixed xb0 = apos1.x-apos0.x;
mixed yb0 = apos1.y-apos0.y;
mixed zb0 = apos1.z-apos0.z;
mixed xc0 = apos2.x-apos0.x;
mixed yc0 = apos2.y-apos0.y;
mixed zc0 = apos2.z-apos0.z;
mixed invTotalMass = 1.0f/(m0+m1+m2);
mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
mixed xa1 = xp0.x - xcom;
mixed ya1 = xp0.y - ycom;
mixed za1 = xp0.z - zcom;
mixed xb1 = xb0 + xp1.x - xcom;
mixed yb1 = yb0 + xp1.y - ycom;
mixed zb1 = zb0 + xp1.z - zcom;
mixed xc1 = xc0 + xp2.x - xcom;
mixed yc1 = yc0 + xp2.y - ycom;
mixed zc1 = zc0 + xp2.z - zcom;
mixed xaksZd = yb0*zc0 - zb0*yc0;
mixed yaksZd = zb0*xc0 - xb0*zc0;
mixed zaksZd = xb0*yc0 - yb0*xc0;
mixed xaksXd = ya1*zaksZd - za1*yaksZd;
mixed yaksXd = za1*xaksZd - xa1*zaksZd;
mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng;
mixed trns12 = xaksYd / aylng;
mixed trns22 = yaksYd / aylng;
mixed trns32 = zaksYd / aylng;
mixed trns13 = xaksZd / azlng;
mixed trns23 = yaksZd / azlng;
mixed trns33 = zaksZd / azlng;
mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' ---
float rc = 0.5*params.y;
float rb = sqrt(params.x*params.x-rc*rc);
float ra = rb*(m1+m2)*invTotalMass;
mixed rb = sqrt(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra;
float sinphi = za1d / ra;
float cosphi = sqrt(1.0f - sinphi*sinphi);
float sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
float cospsi = sqrt(1.0f - sinpsi*sinpsi);
float ya2d = ra*cosphi;
float xb2d = - rc*cospsi;
float yb2d = - rb*cosphi - rc*sinpsi*sinphi;
float yc2d = - rb*cosphi + rc*sinpsi*sinphi;
float xb2d2 = xb2d*xb2d;
float hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
float deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
mixed sinphi = za1d / ra;
mixed cosphi = sqrt(1.0f - sinphi*sinphi);
mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi;
mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5;
// --- Step3 al,be,ga ---
float alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
float beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
float gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
float al2be2 = alpha*alpha + beta*beta;
float sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' ---
float costheta = sqrt(1.0f - sintheta*sintheta);
float xa3d = - ya2d*sintheta;
float ya3d = ya2d*costheta;
float za3d = za1d;
float xb3d = xb2d*costheta - yb2d*sintheta;
float yb3d = xb2d*sintheta + yb2d*costheta;
float zb3d = zb1d;
float xc3d = - xb2d*costheta - yc2d*sintheta;
float yc3d = - xb2d*sintheta + yc2d*costheta;
float zc3d = zc1d;
mixed costheta = sqrt(1.0f - sintheta*sintheta);
mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta;
mixed za3d = za1d;
mixed xb3d = xb2d*costheta - yb2d*sintheta;
mixed yb3d = xb2d*sintheta + yb2d*costheta;
mixed zb3d = zb1d;
mixed xc3d = - xb2d*costheta - yc2d*sintheta;
mixed yc3d = - xb2d*sintheta + yc2d*costheta;
mixed zc3d = zc1d;
// --- Step5 A3 ---
float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3;
xp0.y = ycom + ya3;
......@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res
* Enforce velocity constraints on SETTLE clusters
*/
__kernel void constrainVelocities(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
__kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
float4 apos0 = oldPos[atoms.x];
float4 apos1 = oldPos[atoms.y];
float4 apos2 = oldPos[atoms.z];
float4 v0 = velm[atoms.x];
float4 v1 = velm[atoms.y];
float4 v2 = velm[atoms.z];
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
mixed4 v0 = velm[atoms.x];
mixed4 v1 = velm[atoms.y];
mixed4 v2 = velm[atoms.z];
// Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
// and the angle cosines and sines.
float mA = RECIP(v0.w);
float mB = RECIP(v1.w);
float mC = RECIP(v2.w);
float4 eAB = apos1-apos0;
float4 eBC = apos2-apos1;
float4 eCA = apos0-apos2;
eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
float vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
float vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
float vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
float cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
float cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
float cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
float s2A = 1-cA*cA;
float s2B = 1-cB*cB;
float s2C = 1-cC*cC;
mixed mA = 1/v0.w;
mixed mB = 1/v1.w;
mixed mC = 1/v2.w;
mixed4 eAB = apos1-apos0;
mixed4 eBC = apos2-apos1;
mixed4 eCA = apos0-apos2;
eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
mixed s2A = 1-cA*cA;
mixed s2B = 1-cB*cB;
mixed s2C = 1-cC*cC;
// Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
// in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
// making that assumption). We allow all three atoms to have different masses.
float mABCinv = RECIP(mA*mB*mC);
float denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
float tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
float tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
float tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
mixed mABCinv = 1/(mA*mB*mC);
mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment