Commit 8d6a2a01 authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginnings of mixed/double precision support in OpenCL

parent a3d5f834
...@@ -68,11 +68,18 @@ public: ...@@ -68,11 +68,18 @@ public:
static const std::string key = "OpenCLPlatformIndex"; static const std::string key = "OpenCLPlatformIndex";
return key; return key;
} }
/**
* This is the name of the parameter for selecting what numerical precision to use.
*/
static const std::string& OpenCLPrecision() {
static const std::string key = "OpenCLPrecision";
return key;
}
}; };
class OPENMM_EXPORT OpenCLPlatform::PlatformData { class OPENMM_EXPORT OpenCLPlatform::PlatformData {
public: public:
PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty); PlatformData(const System& system, const std::string& platformPropValue, const std::string& deviceIndexProperty, const std::string& precisionProperty);
~PlatformData(); ~PlatformData();
void initializeContexts(const System& system); void initializeContexts(const System& system);
void syncContexts(); void syncContexts();
......
...@@ -65,10 +65,24 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i ...@@ -65,10 +65,24 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
std::cerr << "OpenCL internal error: " << errinfo << std::endl; std::cerr << "OpenCL internal error: " << errinfo << std::endl;
} }
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData) : OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL), system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL), thread(NULL) { bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for OpenCLPrecision: "+precision);
try { try {
contextIndex = platformData.contexts.size(); contextIndex = platformData.contexts.size();
std::vector<cl::Platform> platforms; std::vector<cl::Platform> platforms;
...@@ -217,8 +231,27 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -217,8 +231,27 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
bonded = new OpenCLBondedUtilities(*this); bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this); nonbonded = new OpenCLNonbondedUtilities(*this);
posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq"); if (useDoublePrecision) {
velm = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "velm"); posq = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_double4";
compilationDefines["convert_mixed4"] = "convert_double4";
}
else if (useMixedPrecision) {
posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
posqCorrection = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["convert_mixed4"] = "convert_double4";
}
else {
posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "velm");
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["convert_mixed4"] = "convert_float4";
}
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0)); posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
} }
catch (cl::Error err) { catch (cl::Error err) {
...@@ -241,34 +274,43 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -241,34 +274,43 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use. // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy"); if (!useDoublePrecision) {
OpenCLArray valuesArray(*this, 20, sizeof(mm_float8), "values"); cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
vector<mm_float8> values(valuesArray.getSize()); OpenCLArray valuesArray(*this, 20, sizeof(mm_float8), "values");
float nextValue = 1e-4f; vector<mm_float8> values(valuesArray.getSize());
for (int i = 0; i < (int) values.size(); ++i) { float nextValue = 1e-4f;
values[i].s0 = nextValue; for (int i = 0; i < (int) values.size(); ++i) {
nextValue *= (float) M_PI; values[i].s0 = nextValue;
nextValue *= (float) M_PI;
}
valuesArray.upload(values);
accuracyKernel.setArg<cl::Buffer>(0, valuesArray.getDeviceBuffer());
accuracyKernel.setArg<cl_int>(1, values.size());
executeKernel(accuracyKernel, values.size());
valuesArray.download(values);
double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
for (int i = 0; i < (int) values.size(); ++i) {
double v = values[i].s0;
double correctSqrt = sqrt(v);
maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].s2)*correctSqrt);
maxRecipError = max(maxRecipError, fabs(1.0/v-values[i].s3)/values[i].s3);
maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
}
compilationDefines["SQRT"] = (maxSqrtError < 1e-6) ? "native_sqrt" : "sqrt";
compilationDefines["RSQRT"] = (maxRsqrtError < 1e-6) ? "native_rsqrt" : "rsqrt";
compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
} }
valuesArray.upload(values); else {
accuracyKernel.setArg<cl::Buffer>(0, valuesArray.getDeviceBuffer()); compilationDefines["SQRT"] = "sqrt";
accuracyKernel.setArg<cl_int>(1, values.size()); compilationDefines["RSQRT"] = "rsqrt";
executeKernel(accuracyKernel, values.size()); compilationDefines["RECIP"] = "1.0/";
valuesArray.download(values); compilationDefines["EXP"] = "exp";
double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0; compilationDefines["LOG"] = "log";
for (int i = 0; i < (int) values.size(); ++i) {
double v = values[i].s0;
double correctSqrt = sqrt(v);
maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].s2)*correctSqrt);
maxRecipError = max(maxRecipError, fabs(1.0/v-values[i].s3)/values[i].s3);
maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
} }
compilationDefines["SQRT"] = (maxSqrtError < 1e-6) ? "native_sqrt" : "sqrt";
compilationDefines["RSQRT"] = (maxRsqrtError < 1e-6) ? "native_rsqrt" : "rsqrt";
compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
// Create the work thread used for parallelization when running on multiple devices. // Create the work thread used for parallelization when running on multiple devices.
...@@ -311,18 +353,21 @@ OpenCLContext::~OpenCLContext() { ...@@ -311,18 +353,21 @@ OpenCLContext::~OpenCLContext() {
} }
void OpenCLContext::initialize() { void OpenCLContext::initialize() {
vector<mm_float4> v(paddedNumAtoms, mm_float4(0, 0, 0, 0));
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
v[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
}
velm->upload(v);
bonded->initialize(system); bonded->initialize(system);
numForceBuffers = platformData.contexts.size(); numForceBuffers = platformData.contexts.size();
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers()); numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
for (int i = 0; i < (int) forces.size(); i++) for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers()); numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); if (useDoublePrecision) {
forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
}
else {
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
}
if (supports64BitGlobalAtomics) { if (supports64BitGlobalAtomics) {
longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer"); longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer()); reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
...@@ -332,12 +377,18 @@ void OpenCLContext::initialize() { ...@@ -332,12 +377,18 @@ void OpenCLContext::initialize() {
addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2); addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
} }
addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4); addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize()); addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
int bufferBytes = max(posq->getSize()*sizeof(mm_float4), energyBuffer->getSize()*sizeof(cl_float)); int bufferBytes = max(posq->getSize()*posq->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes); pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes); pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((mm_double4*) pinnedMemory)[i] = mm_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (cl_float) (1.0/mass));
}
velm->upload(pinnedMemory);
atomIndexDevice = OpenCLArray::create<cl_int>(*this, paddedNumAtoms, "atomIndexDevice"); atomIndexDevice = OpenCLArray::create<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
atomIndex.resize(paddedNumAtoms); atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i) for (int i = 0; i < paddedNumAtoms; ++i)
...@@ -382,6 +433,28 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string, ...@@ -382,6 +433,28 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
} }
if (!compilationDefines.empty()) if (!compilationDefines.empty())
src << endl; src << endl;
if (supportsDoublePrecision)
src << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float4 mixed4;\n";
}
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) { for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first; src << "#define " << iter->first;
if (!iter->second.empty()) if (!iter->second.empty())
...@@ -764,27 +837,62 @@ void OpenCLContext::validateMolecules() { ...@@ -764,27 +837,62 @@ void OpenCLContext::validateMolecules() {
// atoms to their original order, rebuild the list of identical molecules, and sort them // atoms to their original order, rebuild the list of identical molecules, and sort them
// again. // again.
vector<mm_float4> oldPosq(paddedNumAtoms);
vector<mm_float4> newPosq(paddedNumAtoms);
vector<mm_float4> oldVelm(paddedNumAtoms);
vector<mm_float4> newVelm(paddedNumAtoms);
vector<mm_int4> newCellOffsets(numAtoms); vector<mm_int4> newCellOffsets(numAtoms);
posq->download(oldPosq); if (useDoublePrecision) {
velm->download(oldVelm); vector<mm_double4> oldPosq(paddedNumAtoms);
for (int i = 0; i < numAtoms; i++) { vector<mm_double4> newPosq(paddedNumAtoms);
int index = atomIndex[i]; vector<mm_double4> oldVelm(paddedNumAtoms);
newPosq[index] = oldPosq[i]; vector<mm_double4> newVelm(paddedNumAtoms);
newVelm[index] = oldVelm[i]; posq->download(oldPosq);
newCellOffsets[index] = posCellOffsets[i]; velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
else if (useMixedPrecision) {
vector<mm_float4> oldPosq(paddedNumAtoms);
vector<mm_float4> newPosq(paddedNumAtoms);
vector<mm_float4> oldPosqCorrection(paddedNumAtoms);
vector<mm_float4> newPosqCorrection(paddedNumAtoms);
vector<mm_double4> oldVelm(paddedNumAtoms);
vector<mm_double4> newVelm(paddedNumAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newPosqCorrection[index] = oldPosqCorrection[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
else {
vector<mm_float4> oldPosq(paddedNumAtoms);
vector<mm_float4> newPosq(paddedNumAtoms);
vector<mm_float4> oldVelm(paddedNumAtoms);
vector<mm_float4> newVelm(paddedNumAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
} }
posq->upload(newPosq);
velm->upload(newVelm);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = i; atomIndex[i] = i;
posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
} }
posq->upload(newPosq);
velm->upload(newVelm);
atomIndexDevice->upload(atomIndex); atomIndexDevice->upload(atomIndex);
findMoleculeGroups(); findMoleculeGroups();
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
...@@ -797,16 +905,29 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -797,16 +905,29 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if (moleculesInvalid) if (moleculesInvalid)
validateMolecules(); validateMolecules();
atomsWereReordered = true; atomsWereReordered = true;
if (useDoublePrecision)
reorderAtomsImpl<cl_double, mm_double4, cl_double, mm_double4>(enforcePeriodic);
else if (useMixedPrecision)
reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>(enforcePeriodic);
else
reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>(enforcePeriodic);
}
template <class Real, class Real4, class Mixed, class Mixed4>
void OpenCLContext::reorderAtomsImpl(bool enforcePeriodic) {
// Find the range of positions and the number of bins along each axis. // Find the range of positions and the number of bins along each axis.
vector<mm_float4> oldPosq(paddedNumAtoms); vector<Real4> oldPosq(paddedNumAtoms);
vector<mm_float4> oldVelm(paddedNumAtoms); vector<Real4> oldPosqCorrection(paddedNumAtoms);
vector<Mixed4> oldVelm(paddedNumAtoms);
posq->download(oldPosq); posq->download(oldPosq);
velm->download(oldVelm); velm->download(oldVelm);
float minx = oldPosq[0].x, maxx = oldPosq[0].x; if (useMixedPrecision)
float miny = oldPosq[0].y, maxy = oldPosq[0].y; posqCorrection->download(oldPosqCorrection);
float minz = oldPosq[0].z, maxz = oldPosq[0].z; Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
if (nonbonded->getUsePeriodic()) { if (nonbonded->getUsePeriodic()) {
minx = miny = minz = 0.0; minx = miny = minz = 0.0;
maxx = periodicBoxSize.x; maxx = periodicBoxSize.x;
...@@ -815,7 +936,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -815,7 +936,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
} }
else { else {
for (int i = 1; i < numAtoms; i++) { for (int i = 1; i < numAtoms; i++) {
const mm_float4& pos = oldPosq[i]; const Real4& pos = oldPosq[i];
minx = min(minx, pos.x); minx = min(minx, pos.x);
maxx = max(maxx, pos.x); maxx = max(maxx, pos.x);
miny = min(miny, pos.y); miny = min(miny, pos.y);
...@@ -828,8 +949,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -828,8 +949,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// Loop over each group of identical molecules and reorder them. // Loop over each group of identical molecules and reorder them.
vector<int> originalIndex(numAtoms); vector<int> originalIndex(numAtoms);
vector<mm_float4> newPosq(paddedNumAtoms); vector<Real4> newPosq(paddedNumAtoms);
vector<mm_float4> newVelm(paddedNumAtoms); vector<Real4> newPosqCorrection(paddedNumAtoms);
vector<Mixed4> newVelm(paddedNumAtoms);
vector<mm_int4> newCellOffsets(numAtoms); vector<mm_int4> newCellOffsets(numAtoms);
for (int group = 0; group < (int) moleculeGroups.size(); group++) { for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// Find the center of each molecule. // Find the center of each molecule.
...@@ -837,15 +959,15 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -837,15 +959,15 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
MoleculeGroup& mol = moleculeGroups[group]; MoleculeGroup& mol = moleculeGroups[group];
int numMolecules = mol.offsets.size(); int numMolecules = mol.offsets.size();
vector<int>& atoms = mol.atoms; vector<int>& atoms = mol.atoms;
vector<mm_float4> molPos(numMolecules); vector<Real4> molPos(numMolecules);
float invNumAtoms = 1.0f/atoms.size(); Real invNumAtoms = (Real) (1.0/atoms.size());
for (int i = 0; i < numMolecules; i++) { for (int i = 0; i < numMolecules; i++) {
molPos[i].x = 0.0f; molPos[i].x = 0.0f;
molPos[i].y = 0.0f; molPos[i].y = 0.0f;
molPos[i].z = 0.0f; molPos[i].z = 0.0f;
for (int j = 0; j < (int)atoms.size(); j++) { for (int j = 0; j < (int)atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i]; int atom = atoms[j]+mol.offsets[i];
const mm_float4& pos = oldPosq[atom]; const Real4& pos = oldPosq[atom];
molPos[i].x += pos.x; molPos[i].x += pos.x;
molPos[i].y += pos.y; molPos[i].y += pos.y;
molPos[i].z += pos.z; molPos[i].z += pos.z;
...@@ -861,9 +983,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -861,9 +983,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x); int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y); int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z); int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
float dx = xcell*periodicBoxSize.x; Real dx = xcell*periodicBoxSize.x;
float dy = ycell*periodicBoxSize.y; Real dy = ycell*periodicBoxSize.y;
float dz = zcell*periodicBoxSize.z; Real dz = zcell*periodicBoxSize.z;
if (dx != 0.0f || dy != 0.0f || dz != 0.0f) { if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
molPos[i].x -= dx; molPos[i].x -= dx;
molPos[i].y -= dy; molPos[i].y -= dy;
...@@ -871,7 +993,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -871,7 +993,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if (enforcePeriodic) { if (enforcePeriodic) {
for (int j = 0; j < (int) atoms.size(); j++) { for (int j = 0; j < (int) atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i]; int atom = atoms[j]+mol.offsets[i];
mm_float4 p = oldPosq[atom]; Real4 p = oldPosq[atom];
p.x -= dx; p.x -= dx;
p.y -= dy; p.y -= dy;
p.z -= dz; p.z -= dz;
...@@ -888,12 +1010,12 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -888,12 +1010,12 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// Select a bin for each molecule, then sort them by bin. // Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve. bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
float binWidth; Real binWidth;
if (useHilbert) if (useHilbert)
binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0); binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
else else
binWidth = (float)(0.2*nonbonded->getCutoffDistance()); binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
float invBinWidth = 1.0f/binWidth; Real invBinWidth = (Real) (1.0/binWidth);
int xbins = 1 + (int) ((maxx-minx)*invBinWidth); int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
int ybins = 1 + (int) ((maxy-miny)*invBinWidth); int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
vector<pair<int, int> > molBins(numMolecules); vector<pair<int, int> > molBins(numMolecules);
...@@ -928,6 +1050,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -928,6 +1050,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
int newIndex = mol.offsets[i]+atoms[j]; int newIndex = mol.offsets[i]+atoms[j];
originalIndex[newIndex] = atomIndex[oldIndex]; originalIndex[newIndex] = atomIndex[oldIndex];
newPosq[newIndex] = oldPosq[oldIndex]; newPosq[newIndex] = oldPosq[oldIndex];
if (useMixedPrecision)
newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
newVelm[newIndex] = oldVelm[oldIndex]; newVelm[newIndex] = oldVelm[oldIndex];
newCellOffsets[newIndex] = posCellOffsets[oldIndex]; newCellOffsets[newIndex] = posCellOffsets[oldIndex];
} }
...@@ -941,6 +1065,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) { ...@@ -941,6 +1065,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
} }
posq->upload(newPosq); posq->upload(newPosq);
if (useMixedPrecision)
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm); velm->upload(newVelm);
atomIndexDevice->upload(atomIndex); atomIndexDevice->upload(atomIndex);
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
......
...@@ -62,7 +62,7 @@ struct mm_float2 { ...@@ -62,7 +62,7 @@ struct mm_float2 {
mm_float2(cl_float x, cl_float y) : x(x), y(y) { mm_float2(cl_float x, cl_float y) : x(x), y(y) {
} }
}; };
struct mm_float4 { struct mm_float4 {
cl_float x, y, z, w; cl_float x, y, z, w;
mm_float4() { mm_float4() {
} }
...@@ -87,6 +87,20 @@ struct mm_float16 { ...@@ -87,6 +87,20 @@ struct mm_float16 {
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) { s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
} }
}; };
struct mm_double2 {
cl_double x, y;
mm_double2() {
}
mm_double2(cl_double x, cl_double y) : x(x), y(y) {
}
};
struct mm_double4 {
cl_double x, y, z, w;
mm_double4() {
}
mm_double4(cl_double x, cl_double y, cl_double z, cl_double w) : x(x), y(y), z(z), w(w) {
}
};
struct mm_ushort2 { struct mm_ushort2 {
cl_ushort x, y; cl_ushort x, y;
mm_ushort2() { mm_ushort2() {
...@@ -145,7 +159,7 @@ public: ...@@ -145,7 +159,7 @@ public:
class ReorderListener; class ReorderListener;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
OpenCLContext(const System& system, int platformIndex, int deviceIndex, OpenCLPlatform::PlatformData& platformData); OpenCLContext(const System& system, int platformIndex, int deviceIndex, const std::string& precision, OpenCLPlatform::PlatformData& platformData);
~OpenCLContext(); ~OpenCLContext();
/** /**
* This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
...@@ -198,6 +212,12 @@ public: ...@@ -198,6 +212,12 @@ public:
OpenCLArray& getPosq() { OpenCLArray& getPosq() {
return *posq; return *posq;
} }
/**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/
OpenCLArray& getPosqCorrection() {
return *posqCorrection;
}
/** /**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom. * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/ */
...@@ -405,18 +425,38 @@ public: ...@@ -405,18 +425,38 @@ public:
bool getSupportsDoublePrecision() { bool getSupportsDoublePrecision() {
return supportsDoublePrecision; return supportsDoublePrecision;
} }
/**
* Get whether double precision is being used.
*/
bool getUseDoublePrecision() {
return useDoublePrecision;
}
/**
* Get whether mixed precision is being used.
*/
bool getUseMixedPrecision() {
return useMixedPrecision;
}
/** /**
* Get the size of the periodic box. * Get the size of the periodic box.
*/ */
mm_float4 getPeriodicBoxSize() const { mm_float4 getPeriodicBoxSize() const {
return periodicBoxSize; return periodicBoxSize;
} }
/**
* Get the size of the periodic box.
*/
mm_double4 getPeriodicBoxSizeDouble() const {
return periodicBoxSizeDouble;
}
/** /**
* Set the size of the periodic box. * Set the size of the periodic box.
*/ */
void setPeriodicBoxSize(double xsize, double ysize, double zsize) { void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0); periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0); invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
periodicBoxSizeDouble = mm_double4(xsize, ysize, zsize, 0);
invPeriodicBoxSizeDouble = mm_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0);
} }
/** /**
* Get the inverse of the size of the periodic box. * Get the inverse of the size of the periodic box.
...@@ -424,6 +464,12 @@ public: ...@@ -424,6 +464,12 @@ public:
mm_float4 getInvPeriodicBoxSize() const { mm_float4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize; return invPeriodicBoxSize;
} }
/**
* Get the inverse of the size of the periodic box.
*/
mm_double4 getInvPeriodicBoxSizeDouble() const {
return invPeriodicBoxSizeDouble;
}
/** /**
* Get the OpenCLIntegrationUtilities for this context. * Get the OpenCLIntegrationUtilities for this context.
*/ */
...@@ -502,6 +548,11 @@ private: ...@@ -502,6 +548,11 @@ private:
* of molecules and resort the atoms. * of molecules and resort the atoms.
*/ */
void validateMolecules(); void validateMolecules();
/**
* This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
*/
template <class Real, class Real4, class Mixed, class Mixed4>
void reorderAtomsImpl(bool enforcePeriodic);
const System& system; const System& system;
double time; double time;
OpenCLPlatform::PlatformData& platformData; OpenCLPlatform::PlatformData& platformData;
...@@ -515,9 +566,9 @@ private: ...@@ -515,9 +566,9 @@ private:
int numThreadBlocks; int numThreadBlocks;
int numForceBuffers; int numForceBuffers;
int simdWidth; int simdWidth;
bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered, moleculesInvalid; bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, moleculesInvalid;
mm_float4 periodicBoxSize; mm_float4 periodicBoxSize, invPeriodicBoxSize;
mm_float4 invPeriodicBoxSize; mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble;
std::string defaultOptimizationOptions; std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
cl::Context context; cl::Context context;
...@@ -538,6 +589,7 @@ private: ...@@ -538,6 +589,7 @@ private:
cl::Buffer* pinnedBuffer; cl::Buffer* pinnedBuffer;
void* pinnedMemory; void* pinnedMemory;
OpenCLArray* posq; OpenCLArray* posq;
OpenCLArray* posqCorrection;
OpenCLArray* velm; OpenCLArray* velm;
OpenCLArray* force; OpenCLArray* force;
OpenCLArray* forceBuffers; OpenCLArray* forceBuffers;
......
...@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in ...@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in
} }
}; };
static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
if (cl.getUseMixedPrecision())
kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
else
kernel.setArg<void*>(index, NULL);
}
OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context), OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL), posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL), random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
...@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays. // Create workspace arrays.
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta"); if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0, 0.0, 0.0, 0.0)); posDelta = OpenCLArray::create<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
posDelta->upload(deltas); vector<mm_double4> deltas(posDelta->getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize"); posDelta->upload(deltas);
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f)); stepSize = OpenCLArray::create<mm_double2>(context, 1, "stepSize");
stepSize->upload(step); vector<mm_double2> step(1, mm_double2(0.0, 0.0));
stepSize->upload(step);
}
else {
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
vector<mm_float2> step(1, mm_float2(0.0f, 0.0f));
stepSize->upload(step);
}
// Create kernels for enforcing constraints. // Create kernels for enforcing constraints.
...@@ -458,51 +475,86 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -458,51 +475,86 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Record the CCMA data structures. // Record the CCMA data structures.
ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms"); ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints"); ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex"); ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1"); ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged"); ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int)); ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int)); ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_int2> atomsVec(ccmaAtoms->getSize()); vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize()); vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize()); vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize()); vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize()); if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
for (int i = 0; i < numCCMA; i++) { ccmaDistance = OpenCLArray::create<mm_double4>(context, numCCMA, "CcmaDistance");
int index = constraintOrder[i]; ccmaDelta1 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta1");
int c = ccmaConstraints[index]; ccmaDelta2 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta2");
atomsVec[i].x = atom1[c]; ccmaReducedMass = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaReducedMass");
atomsVec[i].y = atom2[c]; ccmaConstraintMatrixValue = OpenCLArray::create<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
distanceVec[i].w = (float) distance[c]; vector<mm_double4> distanceVec(ccmaDistance->getSize());
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c]))); vector<cl_double> reducedMassVec(ccmaReducedMass->getSize());
for (unsigned int j = 0; j < matrix[index].size(); j++) { vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first; for (int i = 0; i < numCCMA; i++) {
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second; int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = distance[c];
reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
} }
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA; ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
} }
for (unsigned int i = 0; i < atomConstraints.size(); i++) { else {
numAtomConstraintsVec[i] = atomConstraints[i].size(); ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) { ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i); ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1); ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = (float) distance[c];
reducedMassVec[i] = (float) (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = (float) matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
} }
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
} }
ccmaAtoms->upload(atomsVec); ccmaAtoms->upload(atomsVec);
ccmaDistance->upload(distanceVec);
ccmaAtomConstraints->upload(atomConstraintsVec); ccmaAtomConstraints->upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec); ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec); ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
// Create the CCMA kernels. // Create the CCMA kernels.
...@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines); cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites"); vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(1, vsite2AvgAtoms->getDeviceBuffer()); setPosqCorrectionArg(context, vsitePositionKernel, 1);
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgWeights->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite3AvgAtoms->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgWeights->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsiteOutOfPlaneAtoms->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneWeights->getDeviceBuffer()); vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces"); vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
// Skip argument 1: the force array hasn't been created yet. setPosqCorrectionArg(context, vsiteForceKernel, 1);
vsiteForceKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer()); // Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
numVsites = num2Avg+num3Avg+numOutOfPlane; numVsites = num2Avg+num3Avg+numOutOfPlane;
} }
...@@ -686,23 +740,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub ...@@ -686,23 +740,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
if (!hasInitialized) { if (!hasInitialized) {
settleKernel.setArg<cl_int>(0, settleAtoms->getSize()); settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(3, posDelta->getDeviceBuffer()); if (context.getUseMixedPrecision())
settleKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer()); settleKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, settleAtoms->getDeviceBuffer()); else
settleKernel.setArg<cl::Buffer>(6, settleParams->getDeviceBuffer()); settleKernel.setArg<void*>(3, NULL);
settleKernel.setArg<cl::Buffer>(4, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
} }
settleKernel.setArg<cl_float>(1, (cl_float) tol); if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel.setArg<cl_double>(1, (cl_double) tol);
else
settleKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(settleKernel, settleAtoms->getSize()); context.executeKernel(settleKernel, settleAtoms->getSize());
} }
if (shakeAtoms != NULL) { if (shakeAtoms != NULL) {
if (!hasInitialized) { if (!hasInitialized) {
shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize()); shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer()); if (context.getUseMixedPrecision())
shakeKernel.setArg<cl::Buffer>(4, shakeAtoms->getDeviceBuffer()); shakeKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeParams->getDeviceBuffer()); else
shakeKernel.setArg<void*>(3, NULL);
shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(6, shakeParams->getDeviceBuffer());
} }
shakeKernel.setArg<cl_float>(1, (cl_float) tol); if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel.setArg<cl_double>(1, (cl_double) tol);
else
shakeKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(shakeKernel, shakeAtoms->getSize()); context.executeKernel(shakeKernel, shakeAtoms->getSize());
} }
if (ccmaAtoms != NULL) { if (ccmaAtoms != NULL) {
...@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub ...@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
ccmaDirectionsKernel.setArg<void*>(3, NULL);
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
...@@ -730,7 +802,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub ...@@ -730,7 +802,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
} }
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol); if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
else
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
const int checkInterval = 4; const int checkInterval = 4;
cl::Event event; cl::Event event;
...@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() { ...@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() { void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) { if (numVsites > 0) {
vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer()); vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
context.executeKernel(vsiteForceKernel, numVsites); context.executeKernel(vsiteForceKernel, numVsites);
} }
} }
......
...@@ -66,6 +66,13 @@ static string intToString(int value) { ...@@ -66,6 +66,13 @@ static string intToString(int value) {
return s.str(); return s.str();
} }
static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
if (cl.getUseMixedPrecision())
kernel.setArg<cl::Buffer>(index, cl.getPosqCorrection().getDeviceBuffer());
else
kernel.setArg<void*>(index, NULL);
}
static bool isZeroExpression(const Lepton::ParsedExpression& expression) { static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
const Lepton::Operation& op = expression.getRootNode().getOperation(); const Lepton::Operation& op = expression.getRootNode().getOperation();
if (op.getId() != Lepton::Operation::CONSTANT) if (op.getId() != Lepton::Operation::CONSTANT)
...@@ -139,81 +146,172 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) { ...@@ -139,81 +146,172 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
} }
void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) { void OpenCLUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
const vector<cl_int>& order = cl.getAtomIndex(); const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles); positions.resize(numParticles);
mm_float4 periodicBoxSize = cl.getPeriodicBoxSize(); mm_double4 periodicBoxSize = cl.getPeriodicBoxSizeDouble();
for (int i = 0; i < numParticles; ++i) { if (cl.getUseDoublePrecision()) {
mm_float4 pos = posq[i]; mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
mm_int4 offset = cl.getPosCellOffsets()[i]; cl.getPosq().download(posq);
positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z); for (int i = 0; i < numParticles; ++i) {
mm_double4 pos = posq[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
}
}
else if (cl.getUseMixedPrecision()) {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
vector<mm_float4> posCorrection;
cl.getPosq().download(posq);
cl.getPosqCorrection().download(posCorrection);
for (int i = 0; i < numParticles; ++i) {
mm_float4 pos1 = posq[i];
mm_float4 pos2 = posCorrection[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x-offset.x*periodicBoxSize.x, (double)pos1.y+(double)pos2.y-offset.y*periodicBoxSize.y, (double)pos1.z+(double)pos2.z-offset.z*periodicBoxSize.z);
}
}
else {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
mm_float4 pos = posq[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
}
} }
} }
void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) { void OpenCLUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
const vector<cl_int>& order = cl.getAtomIndex(); const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
for (int i = 0; i < numParticles; ++i) { if (cl.getUseDoublePrecision()) {
mm_float4& pos = posq[i]; mm_double4* posq = (mm_double4*) cl.getPinnedBuffer();
const Vec3& p = positions[order[i]]; cl.getPosq().download(posq);
pos.x = (cl_float) p[0]; for (int i = 0; i < numParticles; ++i) {
pos.y = (cl_float) p[1]; mm_double4& pos = posq[i];
pos.z = (cl_float) p[2]; const Vec3& p = positions[order[i]];
} pos.x = p[0];
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++) pos.y = p[1];
posq[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f); pos.z = p[2];
cl.getPosq().upload(posq); }
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
posq[i] = mm_double4(0.0, 0.0, 0.0, 0.0);
cl.getPosq().upload(posq);
}
else {
mm_float4* posq = (mm_float4*) cl.getPinnedBuffer();
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
mm_float4& pos = posq[i];
const Vec3& p = positions[order[i]];
pos.x = (cl_float) p[0];
pos.y = (cl_float) p[1];
pos.z = (cl_float) p[2];
}
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
posq[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
cl.getPosq().upload(posq);
}
if (cl.getUseMixedPrecision()) {
mm_float4* posCorrection = (mm_float4*) cl.getPinnedBuffer();
for (int i = 0; i < numParticles; ++i) {
mm_float4& c = posCorrection[i];
const Vec3& p = positions[order[i]];
c.x = (cl_float) (p[0]-(cl_float)p[0]);
c.y = (cl_float) (p[1]-(cl_float)p[1]);
c.z = (cl_float) (p[2]-(cl_float)p[2]);
c.w = 0;
}
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
posCorrection[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
cl.getPosqCorrection().upload(posCorrection);
}
for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++) for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++)
cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0); cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0);
} }
void OpenCLUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) { void OpenCLUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
const vector<cl_int>& order = cl.getAtomIndex(); const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
velocities.resize(numParticles); velocities.resize(numParticles);
for (int i = 0; i < numParticles; ++i) { if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
mm_float4 vel = velm[i]; mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
velocities[order[i]] = Vec3(vel.x, vel.y, vel.z); cl.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
mm_double4 vel = velm[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
}
}
else {
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
mm_float4 vel = velm[i];
mm_int4 offset = cl.getPosCellOffsets()[i];
velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
}
} }
} }
void OpenCLUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) { void OpenCLUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
const vector<cl_int>& order = cl.getAtomIndex(); const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
for (int i = 0; i < numParticles; ++i) { if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
mm_float4& vel = velm[i]; mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
const Vec3& p = velocities[order[i]]; cl.getVelm().download(velm);
vel.x = (cl_float) p[0]; for (int i = 0; i < numParticles; ++i) {
vel.y = (cl_float) p[1]; mm_double4& vel = velm[i];
vel.z = (cl_float) p[2]; const Vec3& p = velocities[order[i]];
} vel.x = p[0];
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++) vel.y = p[1];
velm[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f); vel.z = p[2];
cl.getVelm().upload(velm); }
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
velm[i] = mm_double4(0.0, 0.0, 0.0, 0.0);
cl.getVelm().upload(velm);
}
else {
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
mm_float4& vel = velm[i];
const Vec3& p = velocities[order[i]];
vel.x = p[0];
vel.y = p[1];
vel.z = p[2];
}
for (int i = numParticles; i < cl.getPaddedNumAtoms(); i++)
velm[i] = mm_float4(0.0f, 0.0f, 0.0f, 0.0f);
cl.getVelm().upload(velm);
}
} }
void OpenCLUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) { void OpenCLUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
mm_float4* force = (mm_float4*) cl.getPinnedBuffer();
cl.getForce().download(force);
const vector<cl_int>& order = cl.getAtomIndex(); const vector<cl_int>& order = cl.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
forces.resize(numParticles); forces.resize(numParticles);
for (int i = 0; i < numParticles; ++i) { if (cl.getUseDoublePrecision()) {
mm_float4 f = force[i]; mm_double4* force = (mm_double4*) cl.getPinnedBuffer();
forces[order[i]] = Vec3(f.x, f.y, f.z); cl.getForce().download(force);
for (int i = 0; i < numParticles; ++i) {
mm_double4 f = force[i];
forces[order[i]] = Vec3(f.x, f.y, f.z);
}
}
else {
mm_float4* force = (mm_float4*) cl.getPinnedBuffer();
cl.getForce().download(force);
for (int i = 0; i < numParticles; ++i) {
mm_float4 f = force[i];
forces[order[i]] = Vec3(f.x, f.y, f.z);
}
} }
} }
void OpenCLUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const { void OpenCLUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
mm_float4 box = cl.getPeriodicBoxSize(); mm_double4 box = cl.getPeriodicBoxSizeDouble();
a = Vec3(box.x, 0, 0); a = Vec3(box.x, 0, 0);
b = Vec3(0, box.y, 0); b = Vec3(0, box.y, 0);
c = Vec3(0, 0, box.z); c = Vec3(0, 0, box.z);
...@@ -228,6 +326,8 @@ void OpenCLUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, co ...@@ -228,6 +326,8 @@ void OpenCLUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, co
void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) { void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
int version = 1; int version = 1;
stream.write((char*) &version, sizeof(int)); stream.write((char*) &version, sizeof(int));
int precision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
stream.write((char*) &precision, sizeof(int));
double time = cl.getTime(); double time = cl.getTime();
stream.write((char*) &time, sizeof(double)); stream.write((char*) &time, sizeof(double));
int stepCount = cl.getStepCount(); int stepCount = cl.getStepCount();
...@@ -235,10 +335,14 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream ...@@ -235,10 +335,14 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream
int computeForceCount = cl.getComputeForceCount(); int computeForceCount = cl.getComputeForceCount();
stream.write((char*) &computeForceCount, sizeof(int)); stream.write((char*) &computeForceCount, sizeof(int));
char* buffer = (char*) cl.getPinnedBuffer(); char* buffer = (char*) cl.getPinnedBuffer();
cl.getPosq().download((mm_float4*) buffer); cl.getPosq().download(buffer);
stream.write(buffer, sizeof(mm_float4)*cl.getPosq().getSize()); stream.write(buffer, cl.getPosq().getSize()*cl.getPosq().getElementSize());
cl.getVelm().download((mm_float4*) buffer); if (cl.getUseMixedPrecision()) {
stream.write(buffer, sizeof(mm_float4)*cl.getVelm().getSize()); cl.getPosqCorrection().download(buffer);
stream.write(buffer, cl.getPosqCorrection().getSize()*cl.getPosqCorrection().getElementSize());
}
cl.getVelm().download(buffer);
stream.write(buffer, cl.getVelm().getSize()*cl.getVelm().getElementSize());
stream.write((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size()); stream.write((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
stream.write((char*) &cl.getPosCellOffsets()[0], sizeof(mm_int4)*cl.getPosCellOffsets().size()); stream.write((char*) &cl.getPosCellOffsets()[0], sizeof(mm_int4)*cl.getPosCellOffsets().size());
mm_float4 box = cl.getPeriodicBoxSize(); mm_float4 box = cl.getPeriodicBoxSize();
...@@ -252,6 +356,11 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& ...@@ -252,6 +356,11 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
stream.read((char*) &version, sizeof(int)); stream.read((char*) &version, sizeof(int));
if (version != 1) if (version != 1)
throw OpenMMException("Checkpoint was created with a different version of OpenMM"); throw OpenMMException("Checkpoint was created with a different version of OpenMM");
int precision;
stream.read((char*) &precision, sizeof(int));
int expectedPrecision = (cl.getUseDoublePrecision() ? 2 : cl.getUseMixedPrecision() ? 1 : 0);
if (precision != expectedPrecision)
throw OpenMMException("Checkpoint was created with a different numeric precision");
double time; double time;
stream.read((char*) &time, sizeof(double)); stream.read((char*) &time, sizeof(double));
int stepCount, computeForceCount; int stepCount, computeForceCount;
...@@ -264,9 +373,13 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& ...@@ -264,9 +373,13 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
contexts[i]->setComputeForceCount(computeForceCount); contexts[i]->setComputeForceCount(computeForceCount);
} }
char* buffer = (char*) cl.getPinnedBuffer(); char* buffer = (char*) cl.getPinnedBuffer();
stream.read(buffer, sizeof(mm_float4)*cl.getPosq().getSize()); stream.read(buffer, cl.getPosq().getSize()*cl.getPosq().getElementSize());
cl.getPosq().upload(buffer); cl.getPosq().upload(buffer);
stream.read(buffer, sizeof(mm_float4)*cl.getVelm().getSize()); if (cl.getUseMixedPrecision()) {
stream.read(buffer, cl.getPosqCorrection().getSize()*cl.getPosqCorrection().getElementSize());
cl.getPosqCorrection().upload(buffer);
}
stream.read(buffer, cl.getVelm().getSize()*cl.getVelm().getElementSize());
cl.getVelm().upload(buffer); cl.getVelm().upload(buffer);
stream.read((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size()); stream.read((char*) &cl.getAtomIndex()[0], sizeof(cl_int)*cl.getAtomIndex().size());
cl.getAtomIndexArray().upload(cl.getAtomIndex()); cl.getAtomIndexArray().upload(cl.getAtomIndex());
...@@ -292,7 +405,8 @@ void OpenCLApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -292,7 +405,8 @@ void OpenCLApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
cl::Program program = cl.createProgram(OpenCLKernelSources::constraints, defines); cl::Program program = cl.createProgram(OpenCLKernelSources::constraints, defines);
applyDeltasKernel = cl::Kernel(program, "applyPositionDeltas"); applyDeltasKernel = cl::Kernel(program, "applyPositionDeltas");
applyDeltasKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); applyDeltasKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
applyDeltasKernel.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer()); setPosqCorrectionArg(cl, applyDeltasKernel, 1);
applyDeltasKernel.setArg<cl::Buffer>(2, cl.getIntegrationUtilities().getPosDelta().getDeviceBuffer());
} }
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities(); OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
cl.clearBuffer(integration.getPosDelta()); cl.clearBuffer(integration.getPosDelta());
...@@ -4000,19 +4114,28 @@ void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const Verlet ...@@ -4000,19 +4114,28 @@ void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const Verlet
kernel1.setArg<cl_int>(0, numAtoms); kernel1.setArg<cl_int>(0, numAtoms);
kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel1, 3);
kernel1.setArg<cl::Buffer>(4, cl.getForce().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(5, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(6, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl_int>(0, numAtoms); kernel2.setArg<cl_int>(0, numAtoms);
kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel2, 3);
kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
} }
if (dt != prevStepSize) { if (dt != prevStepSize) {
vector<mm_float2> stepSizeVec(1); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
stepSizeVec[0] = mm_float2((cl_float) dt, (cl_float) dt); vector<mm_double2> stepSizeVec(1);
cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec); stepSizeVec[0] = mm_double2(dt, dt);
cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
else {
vector<mm_float2> stepSizeVec(1);
stepSizeVec[0] = mm_float2((cl_float) dt, (cl_float) dt);
cl.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
prevStepSize = dt; prevStepSize = dt;
} }
...@@ -4055,7 +4178,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L ...@@ -4055,7 +4178,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L
cl::Program program = cl.createProgram(OpenCLKernelSources::langevin, defines, ""); cl::Program program = cl.createProgram(OpenCLKernelSources::langevin, defines, "");
kernel1 = cl::Kernel(program, "integrateLangevinPart1"); kernel1 = cl::Kernel(program, "integrateLangevinPart1");
kernel2 = cl::Kernel(program, "integrateLangevinPart2"); kernel2 = cl::Kernel(program, "integrateLangevinPart2");
params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams"); params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
prevStepSize = -1.0; prevStepSize = -1.0;
} }
...@@ -4071,9 +4194,10 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang ...@@ -4071,9 +4194,10 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, integration.getPosDelta().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel2, 1);
kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, integration.getStepSize().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
} }
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
...@@ -4086,13 +4210,24 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang ...@@ -4086,13 +4210,24 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
double vscale = exp(-stepSize/tau); double vscale = exp(-stepSize/tau);
double fscale = (1-vscale)*tau; double fscale = (1-vscale)*tau;
double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau); double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
vector<cl_float> p(params->getSize()); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
p[0] = (cl_float) vscale; vector<cl_double> p(params->getSize());
p[1] = (cl_float) fscale; p[0] = vscale;
p[2] = (cl_float) noisescale; p[1] = fscale;
params->upload(p); p[2] = noisescale;
mm_float2 ss = mm_float2(0, (float) stepSize); params->upload(p);
integration.getStepSize().upload(&ss); mm_double2 ss = mm_double2(0, stepSize);
integration.getStepSize().upload(&ss);
}
else {
vector<cl_float> p(params->getSize());
p[0] = (cl_float) vscale;
p[1] = (cl_float) fscale;
p[2] = (cl_float) noisescale;
params->upload(p);
mm_float2 ss = mm_float2(0, (float) stepSize);
integration.getStepSize().upload(&ss);
}
prevTemp = temperature; prevTemp = temperature;
prevFriction = friction; prevFriction = friction;
prevStepSize = stepSize; prevStepSize = stepSize;
...@@ -4148,17 +4283,25 @@ void OpenCLIntegrateBrownianStepKernel::execute(ContextImpl& context, const Brow ...@@ -4148,17 +4283,25 @@ void OpenCLIntegrateBrownianStepKernel::execute(ContextImpl& context, const Brow
kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel2, 2);
kernel2.setArg<cl::Buffer>(3, integration.getPosDelta().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer());
} }
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) { if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
double tau = (friction == 0.0 ? 0.0 : 1.0/friction); double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
kernel1.setArg<cl_float>(0, (cl_float) (tau*stepSize)); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
kernel1.setArg<cl_float>(1, (cl_float) (sqrt(2.0f*BOLTZ*temperature*stepSize*tau))); kernel1.setArg<cl_double>(0, tau*stepSize);
kernel2.setArg<cl_float>(0, (cl_float) (1.0/stepSize)); kernel1.setArg<cl_double>(1, sqrt(2.0f*BOLTZ*temperature*stepSize*tau));
kernel2.setArg<cl_double>(0, 1.0/stepSize);
}
else {
kernel1.setArg<cl_float>(0, (cl_float) (tau*stepSize));
kernel1.setArg<cl_float>(1, (cl_float) (sqrt(2.0f*BOLTZ*temperature*stepSize*tau)));
kernel2.setArg<cl_float>(0, (cl_float) (1.0/stepSize));
}
prevTemp = temperature; prevTemp = temperature;
prevFriction = friction; prevFriction = friction;
prevStepSize = stepSize; prevStepSize = stepSize;
...@@ -4205,19 +4348,22 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c ...@@ -4205,19 +4348,22 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) { double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities(); OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
int numAtoms = cl.getNumAtoms(); int numAtoms = cl.getNumAtoms();
bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
kernel1.setArg<cl_int>(0, numAtoms); kernel1.setArg<cl_int>(0, numAtoms);
kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel1, 3);
kernel1.setArg<cl::Buffer>(4, cl.getForce().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(5, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(6, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl_int>(0, numAtoms); kernel2.setArg<cl_int>(0, numAtoms);
kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(1, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel2, 3);
kernel2.setArg<cl::Buffer>(4, integration.getPosDelta().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(5, integration.getPosDelta().getDeviceBuffer());
selectSizeKernel.setArg<cl_int>(0, numAtoms); selectSizeKernel.setArg<cl_int>(0, numAtoms);
selectSizeKernel.setArg<cl::Buffer>(3, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer()); selectSizeKernel.setArg<cl::Buffer>(3, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer()); selectSizeKernel.setArg<cl::Buffer>(4, cl.getVelm().getDeviceBuffer());
...@@ -4227,9 +4373,16 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co ...@@ -4227,9 +4373,16 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
// Select the step size to use. // Select the step size to use.
float maxStepSize = (float)(maxTime-cl.getTime()); double maxStepSize = maxTime-cl.getTime();
selectSizeKernel.setArg<cl_float>(1, maxStepSize); float maxStepSizeFloat = (float) maxStepSize;
selectSizeKernel.setArg<cl_float>(2, (cl_float) integrator.getErrorTolerance()); if (useDouble) {
selectSizeKernel.setArg<cl_double>(1, maxStepSize);
selectSizeKernel.setArg<cl_double>(2, integrator.getErrorTolerance());
}
else {
selectSizeKernel.setArg<cl_float>(1, maxStepSizeFloat);
selectSizeKernel.setArg<cl_float>(2, (cl_float) integrator.getErrorTolerance());
}
cl.executeKernel(selectSizeKernel, blockSize, blockSize); cl.executeKernel(selectSizeKernel, blockSize, blockSize);
// Call the first integration kernel. // Call the first integration kernel.
...@@ -4253,12 +4406,23 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co ...@@ -4253,12 +4406,23 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
// Update the time and step count. // Update the time and step count.
mm_float2 stepSize; double dt, time;
cl.getIntegrationUtilities().getStepSize().download(&stepSize); if (useDouble) {
double dt = stepSize.y; mm_double2 stepSize;
double time = cl.getTime()+dt; cl.getIntegrationUtilities().getStepSize().download(&stepSize);
if (dt == maxStepSize) dt = stepSize.y;
time = maxTime; // Avoid round-off error time = cl.getTime()+dt;
if (dt == maxStepSize)
time = maxTime; // Avoid round-off error
}
else {
mm_float2 stepSize;
cl.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cl.getTime()+dt;
if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error
}
cl.setTime(time); cl.setTime(time);
cl.setStepCount(cl.getStepCount()+1); cl.setStepCount(cl.getStepCount()+1);
return dt; return dt;
...@@ -4279,7 +4443,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system, ...@@ -4279,7 +4443,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
kernel1 = cl::Kernel(program, "integrateLangevinPart1"); kernel1 = cl::Kernel(program, "integrateLangevinPart1");
kernel2 = cl::Kernel(program, "integrateLangevinPart2"); kernel2 = cl::Kernel(program, "integrateLangevinPart2");
selectSizeKernel = cl::Kernel(program, "selectLangevinStepSize"); selectSizeKernel = cl::Kernel(program, "selectLangevinStepSize");
params = OpenCLArray::create<cl_float>(cl, 3, "langevinParams"); params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
blockSize = min(256, system.getNumParticles()); blockSize = min(256, system.getNumParticles());
blockSize = max(blockSize, params->getSize()); blockSize = max(blockSize, params->getSize());
blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
...@@ -4288,6 +4452,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system, ...@@ -4288,6 +4452,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) { double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities(); OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
int numAtoms = cl.getNumAtoms(); int numAtoms = cl.getNumAtoms();
bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
...@@ -4297,9 +4462,10 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, ...@@ -4297,9 +4462,10 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer()); kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(1, integration.getPosDelta().getDeviceBuffer()); setPosqCorrectionArg(cl, kernel2, 1);
kernel2.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, integration.getStepSize().getDeviceBuffer()); kernel2.setArg<cl::Buffer>(3, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer()); selectSizeKernel.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer()); selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer()); selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
...@@ -4310,11 +4476,20 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, ...@@ -4310,11 +4476,20 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
// Select the step size to use. // Select the step size to use.
float maxStepSize = (float)(maxTime-cl.getTime()); double maxStepSize = maxTime-cl.getTime();
selectSizeKernel.setArg<cl_float>(0, maxStepSize); float maxStepSizeFloat = (float) maxStepSize;
selectSizeKernel.setArg<cl_float>(1, (cl_float) integrator.getErrorTolerance()); if (useDouble) {
selectSizeKernel.setArg<cl_float>(2, (cl_float) (integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction())); selectSizeKernel.setArg<cl_double>(0, maxStepSize);
selectSizeKernel.setArg<cl_float>(3, (cl_float) (BOLTZ*integrator.getTemperature())); selectSizeKernel.setArg<cl_double>(1, integrator.getErrorTolerance());
selectSizeKernel.setArg<cl_double>(2, integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction());
selectSizeKernel.setArg<cl_double>(3, BOLTZ*integrator.getTemperature());
}
else {
selectSizeKernel.setArg<cl_float>(0, maxStepSizeFloat);
selectSizeKernel.setArg<cl_float>(1, (cl_float) integrator.getErrorTolerance());
selectSizeKernel.setArg<cl_float>(2, (cl_float) (integrator.getFriction() == 0.0 ? 0.0 : 1.0/integrator.getFriction()));
selectSizeKernel.setArg<cl_float>(3, (cl_float) (BOLTZ*integrator.getTemperature()));
}
cl.executeKernel(selectSizeKernel, blockSize, blockSize); cl.executeKernel(selectSizeKernel, blockSize, blockSize);
// Call the first integration kernel. // Call the first integration kernel.
...@@ -4339,12 +4514,23 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, ...@@ -4339,12 +4514,23 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
// Update the time and step count. // Update the time and step count.
mm_float2 stepSize; double dt, time;
cl.getIntegrationUtilities().getStepSize().download(&stepSize); if (useDouble) {
double dt = stepSize.y; mm_double2 stepSize;
double time = cl.getTime()+dt; cl.getIntegrationUtilities().getStepSize().download(&stepSize);
if (dt == maxStepSize) dt = stepSize.y;
time = maxTime; // Avoid round-off error time = cl.getTime()+dt;
if (dt == maxStepSize)
time = maxTime; // Avoid round-off error
}
else {
mm_float2 stepSize;
cl.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cl.getTime()+dt;
if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error
}
cl.setTime(time); cl.setTime(time);
cl.setStepCount(cl.getStepCount()+1); cl.setStepCount(cl.getStepCount()+1);
return dt; return dt;
...@@ -4352,8 +4538,8 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, ...@@ -4352,8 +4538,8 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
class OpenCLIntegrateCustomStepKernel::ReorderListener : public OpenCLContext::ReorderListener { class OpenCLIntegrateCustomStepKernel::ReorderListener : public OpenCLContext::ReorderListener {
public: public:
ReorderListener(OpenCLContext& cl, OpenCLParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValues, bool& deviceValuesAreCurrent) : ReorderListener(OpenCLContext& cl, OpenCLParameterSet& perDofValues, vector<vector<cl_float> >& localPerDofValuesFloat, vector<vector<cl_double> >& localPerDofValuesDouble, bool& deviceValuesAreCurrent) :
cl(cl), perDofValues(perDofValues), localPerDofValues(localPerDofValues), deviceValuesAreCurrent(deviceValuesAreCurrent) { cl(cl), perDofValues(perDofValues), localPerDofValuesFloat(localPerDofValuesFloat), localPerDofValuesDouble(localPerDofValuesDouble), deviceValuesAreCurrent(deviceValuesAreCurrent) {
int numAtoms = cl.getNumAtoms(); int numAtoms = cl.getNumAtoms();
lastAtomOrder.resize(numAtoms); lastAtomOrder.resize(numAtoms);
for (int i = 0; i < numAtoms; i++) for (int i = 0; i < numAtoms; i++)
...@@ -4365,21 +4551,39 @@ public: ...@@ -4365,21 +4551,39 @@ public:
if (perDofValues.getNumParameters() == 0) if (perDofValues.getNumParameters() == 0)
return; return;
int numAtoms = cl.getNumAtoms(); int numAtoms = cl.getNumAtoms();
if (deviceValuesAreCurrent) const vector<int>& order = cl.getAtomIndex();
perDofValues.getParameterValues(localPerDofValues); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
vector<vector<cl_float> > swap(3*numAtoms); if (deviceValuesAreCurrent)
for (int i = 0; i < numAtoms; i++) { perDofValues.getParameterValues(localPerDofValuesDouble);
swap[3*lastAtomOrder[i]] = localPerDofValues[3*i]; vector<vector<cl_double> > swap(3*numAtoms);
swap[3*lastAtomOrder[i]+1] = localPerDofValues[3*i+1]; for (int i = 0; i < numAtoms; i++) {
swap[3*lastAtomOrder[i]+2] = localPerDofValues[3*i+2]; swap[3*lastAtomOrder[i]] = localPerDofValuesDouble[3*i];
} swap[3*lastAtomOrder[i]+1] = localPerDofValuesDouble[3*i+1];
const vector<cl_int>& order = cl.getAtomIndex(); swap[3*lastAtomOrder[i]+2] = localPerDofValuesDouble[3*i+2];
for (int i = 0; i < numAtoms; i++) { }
localPerDofValues[3*i] = swap[3*order[i]]; for (int i = 0; i < numAtoms; i++) {
localPerDofValues[3*i+1] = swap[3*order[i]+1]; localPerDofValuesDouble[3*i] = swap[3*order[i]];
localPerDofValues[3*i+2] = swap[3*order[i]+2]; localPerDofValuesDouble[3*i+1] = swap[3*order[i]+1];
} localPerDofValuesDouble[3*i+2] = swap[3*order[i]+2];
perDofValues.setParameterValues(localPerDofValues); }
perDofValues.setParameterValues(localPerDofValuesDouble);
}
else {
if (deviceValuesAreCurrent)
perDofValues.getParameterValues(localPerDofValuesFloat);
vector<vector<cl_float> > swap(3*numAtoms);
for (int i = 0; i < numAtoms; i++) {
swap[3*lastAtomOrder[i]] = localPerDofValuesFloat[3*i];
swap[3*lastAtomOrder[i]+1] = localPerDofValuesFloat[3*i+1];
swap[3*lastAtomOrder[i]+2] = localPerDofValuesFloat[3*i+2];
}
for (int i = 0; i < numAtoms; i++) {
localPerDofValuesFloat[3*i] = swap[3*order[i]];
localPerDofValuesFloat[3*i+1] = swap[3*order[i]+1];
localPerDofValuesFloat[3*i+2] = swap[3*order[i]+2];
}
perDofValues.setParameterValues(localPerDofValuesFloat);
}
for (int i = 0; i < numAtoms; i++) for (int i = 0; i < numAtoms; i++)
lastAtomOrder[i] = order[i]; lastAtomOrder[i] = order[i];
deviceValuesAreCurrent = true; deviceValuesAreCurrent = true;
...@@ -4387,7 +4591,8 @@ public: ...@@ -4387,7 +4591,8 @@ public:
private: private:
OpenCLContext& cl; OpenCLContext& cl;
OpenCLParameterSet& perDofValues; OpenCLParameterSet& perDofValues;
vector<vector<cl_float> >& localPerDofValues; vector<vector<cl_float> >& localPerDofValuesFloat;
vector<vector<cl_double> >& localPerDofValuesDouble;
bool& deviceValuesAreCurrent; bool& deviceValuesAreCurrent;
vector<int> lastAtomOrder; vector<int> lastAtomOrder;
}; };
...@@ -4413,11 +4618,12 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus ...@@ -4413,11 +4618,12 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
cl.getPlatformData().initializeContexts(system); cl.getPlatformData().initializeContexts(system);
cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
numGlobalVariables = integrator.getNumGlobalVariables(); numGlobalVariables = integrator.getNumGlobalVariables();
globalValues = OpenCLArray::create<cl_float>(cl, max(1, numGlobalVariables), "globalVariables"); int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
sumBuffer = OpenCLArray::create<cl_float>(cl, 3*system.getNumParticles(), "sumBuffer"); globalValues = new OpenCLArray(cl, max(1, numGlobalVariables), elementSize, "globalVariables");
energy = OpenCLArray::create<cl_float>(cl, 1, "energy"); sumBuffer = new OpenCLArray(cl, 3*system.getNumParticles(), elementSize, "sumBuffer");
perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables"); energy = new OpenCLArray(cl, 1, elementSize, "energy");
cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValues, deviceValuesAreCurrent)); perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
prevStepSize = -1.0; prevStepSize = -1.0;
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed()); SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
} }
...@@ -4492,19 +4698,31 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4492,19 +4698,31 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities(); OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
int numAtoms = cl.getNumAtoms(); int numAtoms = cl.getNumAtoms();
int numSteps = integrator.getNumComputations(); int numSteps = integrator.getNumComputations();
bool useDouble = cl.getUseDoublePrecision() || cl.getUseMixedPrecision();
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
// Initialize various data structures. // Initialize various data structures.
const map<string, double>& params = context.getParameters(); const map<string, double>& params = context.getParameters();
contextParameterValues = OpenCLArray::create<cl_float>(cl, max(1, (int) params.size()), "contextParameters"); if (useDouble) {
contextValues.resize(contextParameterValues->getSize()); contextParameterValues = OpenCLArray::create<cl_double>(cl, max(1, (int) params.size()), "contextParameters");
for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) { contextValuesDouble.resize(contextParameterValues->getSize());
contextValues[parameterNames.size()] = (float) iter->second; for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
parameterNames.push_back(iter->first); contextValuesDouble[parameterNames.size()] = iter->second;
parameterNames.push_back(iter->first);
}
contextParameterValues->upload(contextValuesDouble);
}
else {
contextParameterValues = OpenCLArray::create<cl_float>(cl, max(1, (int) params.size()), "contextParameters");
contextValuesFloat.resize(contextParameterValues->getSize());
for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
contextValuesFloat[parameterNames.size()] = (float) iter->second;
parameterNames.push_back(iter->first);
}
contextParameterValues->upload(contextValuesFloat);
} }
contextParameterValues->upload(contextValues);
kernels.resize(integrator.getNumComputations()); kernels.resize(integrator.getNumComputations());
requiredGaussian.resize(integrator.getNumComputations(), 0); requiredGaussian.resize(integrator.getNumComputations(), 0);
requiredUniform.resize(integrator.getNumComputations(), 0); requiredUniform.resize(integrator.getNumComputations(), 0);
...@@ -4644,7 +4862,6 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4644,7 +4862,6 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
compute << buffer.getType()<<" perDofy"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+1];\n"; compute << buffer.getType()<<" perDofy"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+1];\n";
compute << buffer.getType()<<" perDofz"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+2];\n"; compute << buffer.getType()<<" perDofz"<<intToString(i+1)<<" = perDofValues"<<intToString(i+1)<<"[3*index+2];\n";
} }
string convert = (cl.getSupportsDoublePrecision() ? "convert_float4(" : "(");
int numGaussian = 0, numUniform = 0; int numGaussian = 0, numUniform = 0;
for (int j = step; j < numSteps && (j == step || merged[j]); j++) { for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
compute << "{\n"; compute << "{\n";
...@@ -4653,15 +4870,15 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4653,15 +4870,15 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (variable[j] == "x") { if (variable[j] == "x") {
if (storePosAsDelta[j]) { if (storePosAsDelta[j]) {
if (cl.getSupportsDoublePrecision()) if (cl.getSupportsDoublePrecision())
compute << "posDelta[index] = convert_float4(position-convert_double4(posq[index]));\n"; compute << "posDelta[index] = convert_mixed4(convert_double4(position)-convert_double4(loadPos(posq, posqCorrection, index)));\n";
else else
compute << "posDelta[index] = position-posq[index];\n"; compute << "posDelta[index] = position-posq[index];\n";
} }
else else
compute << "posq[index] = " << convert << "position);\n"; compute << "storePos(posq, posqCorrection, index, position);\n";
} }
else if (variable[j] == "v") else if (variable[j] == "v")
compute << "velm[index] = " << convert << "velocity);\n"; compute << "velm[index] = convert_mixed4(velocity);\n";
else { else {
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) { for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i]; const OpenCLNonbondedUtilities::ParameterInfo& buffer = perDofValues->getBuffers()[i];
...@@ -4694,6 +4911,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4694,6 +4911,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
requiredUniform[step] = numUniform; requiredUniform[step] = numUniform;
int index = 0; int index = 0;
kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel, index++);
kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, cl.getVelm().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, cl.getVelm().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, cl.getForce().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, cl.getForce().getDeviceBuffer());
...@@ -4711,7 +4929,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4711,7 +4929,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Create a second kernel for this step that sums the values. // Create a second kernel for this step that sums the values.
program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines); program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
kernel = cl::Kernel(program, "computeSum"); kernel = cl::Kernel(program, useDouble ? "computeDoubleSum" : "computeFloatSum");
kernels[step].push_back(kernel); kernels[step].push_back(kernel);
index = 0; index = 0;
kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
...@@ -4760,6 +4978,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4760,6 +4978,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels[step].push_back(kernel); kernels[step].push_back(kernel);
int index = 0; int index = 0;
kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
setPosqCorrectionArg(cl, kernel, index++);
kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
} }
} }
...@@ -4767,7 +4986,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4767,7 +4986,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Create the kernel for summing energy. // Create the kernel for summing energy.
cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines); cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
sumEnergyKernel = cl::Kernel(program, "computeSum"); sumEnergyKernel = cl::Kernel(program, cl.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
int index = 0; int index = 0;
sumEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer()); sumEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
sumEnergyKernel.setArg<cl::Buffer>(index++, energy->getDeviceBuffer()); sumEnergyKernel.setArg<cl::Buffer>(index++, energy->getDeviceBuffer());
...@@ -4778,26 +4997,48 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4778,26 +4997,48 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Make sure all values (variables, parameters, etc.) stored on the device are up to date. // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
if (!deviceValuesAreCurrent) { if (!deviceValuesAreCurrent) {
perDofValues->setParameterValues(localPerDofValues); if (useDouble)
perDofValues->setParameterValues(localPerDofValuesDouble);
else
perDofValues->setParameterValues(localPerDofValuesFloat);
deviceValuesAreCurrent = true; deviceValuesAreCurrent = true;
} }
localValuesAreCurrent = false; localValuesAreCurrent = false;
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
if (stepSize != prevStepSize) { if (stepSize != prevStepSize) {
mm_float2 ss = mm_float2(0, (float) stepSize); if (useDouble) {
integration.getStepSize().upload(&ss); mm_double2 ss = mm_double2(0, stepSize);
integration.getStepSize().upload(&ss);
}
else {
mm_float2 ss = mm_float2(0, (float) stepSize);
integration.getStepSize().upload(&ss);
}
prevStepSize = stepSize; prevStepSize = stepSize;
} }
bool paramsChanged = false; bool paramsChanged = false;
for (int i = 0; i < (int) parameterNames.size(); i++) { if (useDouble) {
float value = (float) context.getParameter(parameterNames[i]); for (int i = 0; i < (int) parameterNames.size(); i++) {
if (value != contextValues[i]) { double value = context.getParameter(parameterNames[i]);
contextValues[i] = value; if (value != contextValuesDouble[i]) {
paramsChanged = true; contextValuesDouble[i] = value;
paramsChanged = true;
}
}
if (paramsChanged)
contextParameterValues->upload(contextValuesDouble);
}
else {
for (int i = 0; i < (int) parameterNames.size(); i++) {
float value = (float) context.getParameter(parameterNames[i]);
if (value != contextValuesFloat[i]) {
contextValuesFloat[i] = value;
paramsChanged = true;
}
} }
if (paramsChanged)
contextParameterValues->upload(contextValuesFloat);
} }
if (paramsChanged)
contextParameterValues->upload(contextValues);
// Loop over computation steps in the integrator and execute them. // Loop over computation steps in the integrator and execute them.
...@@ -4826,7 +5067,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4826,7 +5067,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
forcesAreValid = true; forcesAreValid = true;
} }
if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) { if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i])); kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[i][0], numAtoms); cl.executeKernel(kernels[i][0], numAtoms);
...@@ -4837,7 +5078,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4837,7 +5078,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl.executeKernel(kernels[i][0], 1, 1); cl.executeKernel(kernels[i][0], 1, 1);
} }
else if (stepType[i] == CustomIntegrator::ComputeSum) { else if (stepType[i] == CustomIntegrator::ComputeSum) {
kernels[i][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[i])); kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[i][0], numAtoms); cl.executeKernel(kernels[i][0], numAtoms);
...@@ -4875,11 +5116,21 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -4875,11 +5116,21 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
void OpenCLIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) { void OpenCLIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
if (!modifiesParameters) if (!modifiesParameters)
return; return;
contextParameterValues->download(contextValues); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
for (int i = 0; i < (int) parameterNames.size(); i++) { contextParameterValues->download(contextValuesDouble);
float value = (float) context.getParameter(parameterNames[i]); for (int i = 0; i < (int) parameterNames.size(); i++) {
if (value != contextValues[i]) double value = context.getParameter(parameterNames[i]);
context.setParameter(parameterNames[i], contextValues[i]); if (value != contextValuesDouble[i])
context.setParameter(parameterNames[i], contextValuesDouble[i]);
}
}
else {
contextParameterValues->download(contextValuesFloat);
for (int i = 0; i < (int) parameterNames.size(); i++) {
float value = (float) context.getParameter(parameterNames[i]);
if (value != contextValuesFloat[i])
context.setParameter(parameterNames[i], contextValuesFloat[i]);
}
} }
} }
...@@ -4888,43 +5139,72 @@ void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, v ...@@ -4888,43 +5139,72 @@ void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, v
values.resize(0); values.resize(0);
return; return;
} }
vector<cl_float> buffer; if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
globalValues->download(buffer); globalValues->download(values);
values.resize(numGlobalVariables); else {
for (int i = 0; i < numGlobalVariables; i++) vector<cl_float> buffer;
values[i] = buffer[i]; globalValues->download(buffer);
for (int i = 0; i < numGlobalVariables; i++)
values[i] = buffer[i];
}
} }
void OpenCLIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) { void OpenCLIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
if (numGlobalVariables == 0) if (numGlobalVariables == 0)
return; return;
vector<cl_float> valuesVec(numGlobalVariables); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
for (int i = 0; i < numGlobalVariables; i++) globalValues->upload(values);
valuesVec[i] = (float) values[i]; else {
globalValues->upload(valuesVec); vector<cl_float> buffer(numGlobalVariables);
for (int i = 0; i < numGlobalVariables; i++)
buffer[i] = (cl_float) values[i];
globalValues->upload(buffer);
}
} }
void OpenCLIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const { void OpenCLIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
if (!localValuesAreCurrent) {
perDofValues->getParameterValues(localPerDofValues);
localValuesAreCurrent = true;
}
values.resize(perDofValues->getNumObjects()/3); values.resize(perDofValues->getNumObjects()/3);
const vector<cl_int>& order = cl.getAtomIndex(); const vector<int>& order = cl.getAtomIndex();
for (int i = 0; i < (int) values.size(); i++) if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
for (int j = 0; j < 3; j++) if (!localValuesAreCurrent) {
values[order[i]][j] = localPerDofValues[3*i+j][variable]; perDofValues->getParameterValues(localPerDofValuesDouble);
localValuesAreCurrent = true;
}
for (int i = 0; i < (int) values.size(); i++)
for (int j = 0; j < 3; j++)
values[order[i]][j] = localPerDofValuesDouble[3*i+j][variable];
}
else {
if (!localValuesAreCurrent) {
perDofValues->getParameterValues(localPerDofValuesFloat);
localValuesAreCurrent = true;
}
for (int i = 0; i < (int) values.size(); i++)
for (int j = 0; j < 3; j++)
values[order[i]][j] = localPerDofValuesFloat[3*i+j][variable];
}
} }
void OpenCLIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) { void OpenCLIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
if (!localValuesAreCurrent) { const vector<int>& order = cl.getAtomIndex();
perDofValues->getParameterValues(localPerDofValues); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
localValuesAreCurrent = true; if (!localValuesAreCurrent) {
perDofValues->getParameterValues(localPerDofValuesDouble);
localValuesAreCurrent = true;
}
for (int i = 0; i < (int) values.size(); i++)
for (int j = 0; j < 3; j++)
localPerDofValuesDouble[3*i+j][variable] = values[order[i]][j];
}
else {
if (!localValuesAreCurrent) {
perDofValues->getParameterValues(localPerDofValuesFloat);
localValuesAreCurrent = true;
}
for (int i = 0; i < (int) values.size(); i++)
for (int j = 0; j < 3; j++)
localPerDofValuesFloat[3*i+j][variable] = (float) values[order[i]][j];
} }
const vector<cl_int>& order = cl.getAtomIndex();
for (int i = 0; i < (int) values.size(); i++)
for (int j = 0; j < 3; j++)
localPerDofValues[3*i+j][variable] = (float) values[order[i]][j];
deviceValuesAreCurrent = false; deviceValuesAreCurrent = false;
} }
...@@ -5035,13 +5315,23 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) { ...@@ -5035,13 +5315,23 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
// We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy // We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
// on the CPU. // on the CPU.
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
double energy = 0.0;
const vector<cl_int>& order = cl.getAtomIndex(); const vector<cl_int>& order = cl.getAtomIndex();
for (size_t i = 0; i < masses.size(); ++i) { double energy = 0.0;
mm_float4 v = velm[i]; if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z); mm_double4* velm = (mm_double4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
for (size_t i = 0; i < masses.size(); ++i) {
mm_double4 v = velm[i];
energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
}
}
else {
mm_float4* velm = (mm_float4*) cl.getPinnedBuffer();
cl.getVelm().download(velm);
for (size_t i = 0; i < masses.size(); ++i) {
mm_float4 v = velm[i];
energy += masses[order[i]]*(v.x*v.x+v.y*v.y+v.z*v.z);
}
} }
return 0.5*energy; return 0.5*energy;
} }
......
...@@ -1145,7 +1145,10 @@ private: ...@@ -1145,7 +1145,10 @@ private:
OpenCLArray* uniformRandoms; OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed; OpenCLArray* randomSeed;
OpenCLParameterSet* perDofValues; OpenCLParameterSet* perDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValues; mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
std::vector<float> contextValuesFloat;
std::vector<double> contextValuesDouble;
std::vector<float> contextValues; std::vector<float> contextValues;
std::vector<std::vector<cl::Kernel> > kernels; std::vector<std::vector<cl::Kernel> > kernels;
cl::Kernel sumEnergyKernel, randomKernel; cl::Kernel sumEnergyKernel, randomKernel;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -32,32 +32,34 @@ ...@@ -32,32 +32,34 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter) : OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
context(context), numParameters(numParameters), numObjects(numObjects), name(name) { context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
int params = numParameters; int params = numParameters;
int bufferCount = 0; int bufferCount = 0;
elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
string elementType = (useDoublePrecision ? "double" : "float");
try { try {
if (!bufferPerParameter) { if (!bufferPerParameter) {
while (params > 2) { while (params > 2) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*4);
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 4, elementSize*4, *buf));
params -= 4; params -= 4;
} }
if (params > 1) { if (params > 1) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize*2);
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 2, elementSize*2, *buf));
params -= 2; params -= 2;
} }
} }
while (params > 0) { while (params > 0) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*elementSize);
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), elementType, 1, elementSize, *buf));
params--; params--;
} }
} }
...@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() { ...@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() {
delete &buffers[i].getMemory(); delete &buffers[i].getMemory();
} }
void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) const { template <class T>
void OpenCLParameterSet::getParameterValues(vector<vector<T> >& values) const {
if (sizeof(T) != elementSize)
throw OpenMMException("Called getParameterValues() with vector of wrong type");
values.resize(numObjects); values.resize(numObjects);
for (int i = 0; i < numObjects; i++) for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters); values[i].resize(numParameters);
try { try {
int base = 0; int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) { for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") { if (buffers[i].getSize() == 4*elementSize) {
vector<mm_float4> data(numObjects); vector<T> data(4*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x; values[j][base] = data[4*j];
if (base+1 < numParameters) if (base+1 < numParameters)
values[j][base+1] = data[j].y; values[j][base+1] = data[4*j+1];
if (base+2 < numParameters) if (base+2 < numParameters)
values[j][base+2] = data[j].z; values[j][base+2] = data[4*j+2];
if (base+3 < numParameters) if (base+3 < numParameters)
values[j][base+3] = data[j].w; values[j][base+3] = data[4*j+3];
} }
base += 4; base += 4;
} }
else if (buffers[i].getType() == "float2") { else if (buffers[i].getSize() == 2*elementSize) {
vector<mm_float2> data(numObjects); vector<T> data(2*numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
values[j][base] = data[j].x; values[j][base] = data[2*j];
if (base+1 < numParameters) if (base+1 < numParameters)
values[j][base+1] = data[j].y; values[j][base+1] = data[2*j+1];
} }
base += 2; base += 2;
} }
else if (buffers[i].getType() == "float") { else if (buffers[i].getSize() == elementSize) {
vector<cl_float> data(numObjects); vector<T> data(numObjects);
context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueReadBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
for (int j = 0; j < numObjects; j++) for (int j = 0; j < numObjects; j++)
values[j][base] = data[j]; values[j][base] = data[j];
...@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c ...@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c
} }
} }
void OpenCLParameterSet::setParameterValues(const vector<vector<cl_float> >& values) { template <class T>
void OpenCLParameterSet::setParameterValues(const vector<vector<T> >& values) {
if (sizeof(T) != elementSize)
throw OpenMMException("Called setParameterValues() with vector of wrong type");
try { try {
int base = 0; int base = 0;
for (int i = 0; i < (int) buffers.size(); i++) { for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getType() == "float4") { if (buffers[i].getSize() == 4*elementSize) {
vector<mm_float4> data(numObjects); vector<T> data(4*numObjects);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base]; data[4*j] = values[j][base];
if (base+1 < numParameters) if (base+1 < numParameters)
data[j].y = values[j][base+1]; data[4*j+1] = values[j][base+1];
if (base+2 < numParameters) if (base+2 < numParameters)
data[j].z = values[j][base+2]; data[4*j+2] = values[j][base+2];
if (base+3 < numParameters) if (base+3 < numParameters)
data[j].w = values[j][base+3]; data[4*j+3] = values[j][base+3];
} }
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 4; base += 4;
} }
else if (buffers[i].getType() == "float2") { else if (buffers[i].getSize() == 2*elementSize) {
vector<mm_float2> data(numObjects); vector<T> data(2*numObjects);
for (int j = 0; j < numObjects; j++) { for (int j = 0; j < numObjects; j++) {
data[j].x = values[j][base]; data[2*j] = values[j][base];
if (base+1 < numParameters) if (base+1 < numParameters)
data[j].y = values[j][base+1]; data[2*j+1] = values[j][base+1];
} }
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
base += 2; base += 2;
} }
else if (buffers[i].getType() == "float") { else if (buffers[i].getSize() == elementSize) {
vector<cl_float> data(numObjects); vector<T> data(numObjects);
for (int j = 0; j < numObjects; j++) for (int j = 0; j < numObjects; j++)
data[j] = values[j][base]; data[j] = values[j][base];
context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]); context.getQueue().enqueueWriteBuffer(reinterpret_cast<cl::Buffer&>(buffers[i].getMemory()), CL_TRUE, 0, numObjects*buffers[i].getSize(), &data[0]);
...@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr ...@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr
const string suffixes[] = {".x", ".y", ".z", ".w"}; const string suffixes[] = {".x", ".y", ".z", ".w"};
int buffer = -1; int buffer = -1;
for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) { for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
if (index*sizeof(cl_float) < buffers[i].getSize()) if (index*elementSize < buffers[i].getSize())
buffer = i; buffer = i;
else else
index -= buffers[i].getSize()/sizeof(cl_float); index -= buffers[i].getSize()/elementSize;
} }
if (buffer == -1) if (buffer == -1)
throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")"); throw OpenMMException("Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("+name+")");
stringstream suffix; stringstream suffix;
suffix << (buffer+1) << extraSuffix; suffix << (buffer+1) << extraSuffix;
if (buffers[buffer].getType() != "float") if (buffers[buffer].getSize() != elementSize)
suffix << suffixes[index]; suffix << suffixes[index];
return suffix.str(); return suffix.str();
} }
/**
* Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
*/
namespace OpenMM {
template void OpenCLParameterSet::getParameterValues<float>(vector<vector<float> >& values) const;
template void OpenCLParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
template void OpenCLParameterSet::getParameterValues<double>(vector<vector<double> >& values) const;
template void OpenCLParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
}
\ No newline at end of file
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -51,8 +51,9 @@ public: ...@@ -51,8 +51,9 @@ public:
* @param name the name of the parameter set * @param name the name of the parameter set
* @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false, * @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false,
* multiple parameters may be combined into a single buffer. * multiple parameters may be combined into a single buffer.
* @param useDoublePrecision whether values should be stored as single or double precision
*/ */
OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false); OpenCLParameterSet(OpenCLContext& context, int numParameters, int numObjects, const std::string& name, bool bufferPerParameter=false, bool useDoublePrecision=false);
~OpenCLParameterSet(); ~OpenCLParameterSet();
/** /**
* Get the number of parameters. * Get the number of parameters.
...@@ -71,13 +72,15 @@ public: ...@@ -71,13 +72,15 @@ public:
* *
* @param values on exit, values[i][j] contains the value of parameter j for object i * @param values on exit, values[i][j] contains the value of parameter j for object i
*/ */
void getParameterValues(std::vector<std::vector<cl_float> >& values) const; template <class T>
void getParameterValues(std::vector<std::vector<T> >& values) const;
/** /**
* Set the values of all parameters. * Set the values of all parameters.
* *
* @param values values[i][j] contains the value of parameter j for object i * @param values values[i][j] contains the value of parameter j for object i
*/ */
void setParameterValues(const std::vector<std::vector<cl_float> >& values); template <class T>
void setParameterValues(const std::vector<std::vector<T> >& values);
/** /**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers * Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data. * containing the data.
...@@ -95,8 +98,7 @@ public: ...@@ -95,8 +98,7 @@ public:
std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const; std::string getParameterSuffix(int index, const std::string& extraSuffix = "") const;
private: private:
OpenCLContext& context; OpenCLContext& context;
int numParameters; int numParameters, numObjects, elementSize;
int numObjects;
std::string name; std::string name;
std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers; std::vector<OpenCLNonbondedUtilities::ParameterInfo> buffers;
}; };
......
...@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() { ...@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() {
registerKernelFactory(RemoveCMMotionKernel::Name(), factory); registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(OpenCLDeviceIndex()); platformProperties.push_back(OpenCLDeviceIndex());
platformProperties.push_back(OpenCLPlatformIndex()); platformProperties.push_back(OpenCLPlatformIndex());
platformProperties.push_back(OpenCLPrecision());
setPropertyDefaultValue(OpenCLDeviceIndex(), ""); setPropertyDefaultValue(OpenCLDeviceIndex(), "");
setPropertyDefaultValue(OpenCLPlatformIndex(), ""); setPropertyDefaultValue(OpenCLPlatformIndex(), "");
setPropertyDefaultValue(OpenCLPrecision(), "single");
} }
bool OpenCLPlatform::supportsDoublePrecision() const { bool OpenCLPlatform::supportsDoublePrecision() const {
...@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri ...@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second); getPropertyDefaultValue(OpenCLPlatformIndex()) : properties.find(OpenCLPlatformIndex())->second);
const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ? const string& devicePropValue = (properties.find(OpenCLDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second); getPropertyDefaultValue(OpenCLDeviceIndex()) : properties.find(OpenCLDeviceIndex())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue)); string precisionPropValue = (properties.find(OpenCLPrecision()) == properties.end() ?
getPropertyDefaultValue(OpenCLPrecision()) : properties.find(OpenCLPrecision())->second);
context.setPlatformData(new PlatformData(context.getSystem(), platformPropValue, devicePropValue, precisionPropValue));
} }
void OpenCLPlatform::contextDestroyed(ContextImpl& context) const { void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
...@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
delete data; delete data;
} }
OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) { OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& platformPropValue, const string& deviceIndexProperty,
const string& precisionProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
int platformIndex = 0; int platformIndex = 0;
if (platformPropValue.length() > 0) if (platformPropValue.length() > 0)
stringstream(platformPropValue) >> platformIndex; stringstream(platformPropValue) >> platformIndex;
...@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p ...@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
if (devices[i].length() > 0) { if (devices[i].length() > 0) {
unsigned int deviceIndex; unsigned int deviceIndex;
stringstream(devices[i]) >> deviceIndex; stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, *this)); contexts.push_back(new OpenCLContext(system, platformIndex, deviceIndex, precisionProperty, *this));
} }
} }
if (contexts.size() == 0) if (contexts.size() == 0)
contexts.push_back(new OpenCLContext(system, platformIndex, -1, *this)); contexts.push_back(new OpenCLContext(system, platformIndex, -1, precisionProperty, *this));
stringstream device; stringstream device;
for (int i = 0; i < (int) contexts.size(); i++) { for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0) if (i > 0)
...@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p ...@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
} }
propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str(); propertyValues[OpenCLPlatform::OpenCLDeviceIndex()] = device.str();
propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex); propertyValues[OpenCLPlatform::OpenCLPlatformIndex()] = OpenCLExpressionUtilities::intToString(platformIndex);
propertyValues[OpenCLPlatform::OpenCLPrecision()] = precisionProperty;
contextEnergy.resize(contexts.size()); contextEnergy.resize(contexts.size());
} }
......
...@@ -2,17 +2,19 @@ ...@@ -2,17 +2,19 @@
* Apply the Andersen thermostat to adjust particle velocities. * Apply the Andersen thermostat to adjust particle velocities.
*/ */
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global float4* velm, __global const float2* restrict stepSize, __global const float4* restrict random, __kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global mixed4* velm, __global const mixed2* restrict stepSize, __global const float4* restrict random,
unsigned int randomIndex, __global const int* restrict atomGroups) { unsigned int randomIndex, __global const int* restrict atomGroups) {
float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y); float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
float randomRange = erf(collisionProbability/sqrt(2.0f)); float randomRange = erf(collisionProbability/sqrt(2.0f));
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 velocity = velm[index]; mixed4 velocity = velm[index];
float4 selectRand = random[randomIndex+atomGroups[index]]; float4 selectRand = random[randomIndex+atomGroups[index]];
float4 velRand = random[randomIndex+index]; float4 velRand = random[randomIndex+index];
float scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0.0f : 1.0f); real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1);
float add = (1.0f-scale)*sqrt(kT*velocity.w); real add = (1-scale)*sqrt(kT*velocity.w);
velocity.xyz = scale*velocity.xyz + add*velRand.xyz; velocity.x = scale*velocity.x + add*velRand.x;
velocity.y = scale*velocity.y + add*velRand.y;
velocity.z = scale*velocity.z + add*velRand.z;
velm[index] = velocity; velm[index] = velocity;
} }
} }
...@@ -2,13 +2,16 @@ ...@@ -2,13 +2,16 @@
* Perform the first step of Brownian integration. * Perform the first step of Brownian integration.
*/ */
__kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __global const float4* restrict force, __kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __global const real4* restrict force,
__global float4* restrict posDelta, __global const float4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) { __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
randomIndex += get_global_id(0); randomIndex += get_global_id(0);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float invMass = velm[index].w; mixed invMass = velm[index].w;
if (invMass != 0.0) if (invMass != 0) {
posDelta[index] = (float4) (tauDeltaT*invMass*force[index].xyz + noiseAmplitude*sqrt(invMass)*random[randomIndex].xyz, 0.0f); posDelta[index] = (mixed4) (tauDeltaT*invMass*force[index].x + noiseAmplitude*sqrt(invMass)*random[randomIndex].x,
tauDeltaT*invMass*force[index].y + noiseAmplitude*sqrt(invMass)*random[randomIndex].y,
tauDeltaT*invMass*force[index].z + noiseAmplitude*sqrt(invMass)*random[randomIndex].z, 0);
}
randomIndex += get_global_size(0); randomIndex += get_global_size(0);
} }
} }
...@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl ...@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
* Perform the second step of Brownian integration. * Perform the second step of Brownian integration.
*/ */
__kernel void integrateBrownianPart2(float oneOverDeltaT, __global float4* posq, __global float4* velm, __global const float4* restrict posDelta) { __kernel void integrateBrownianPart2(mixed oneOverDeltaT, __global real4* posq, __global real4* posqCorrection, __global mixed4* velm, __global const mixed4* restrict posDelta) {
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
if (velm[index].w != 0.0) { if (velm[index].w != 0) {
float4 delta = posDelta[index]; mixed4 delta = posDelta[index];
velm[index].xyz = oneOverDeltaT*delta.xyz; velm[index].x = oneOverDeltaT*delta.x;
posq[index].xyz = posq[index].xyz + delta.xyz; velm[index].y = oneOverDeltaT*delta.y;
velm[index].z = oneOverDeltaT*delta.z;
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
} }
} }
} }
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/** /**
* Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation. * Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/ */
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global float4* restrict constraintDistance, __global const float4* restrict atomPositions) { __kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global mixed4* restrict constraintDistance, __global const real4* restrict atomPositions, __global const real4* restrict posCorrection) {
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the direction for this constraint. // Compute the direction for this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index]; mixed4 dir = constraintDistance[index];
float4 oldPos1 = atomPositions[atoms.x]; mixed4 oldPos1 = loadPos(atomPositions, posCorrection, atoms.x);
float4 oldPos2 = atomPositions[atoms.y]; mixed4 oldPos2 = loadPos(atomPositions, posCorrection, atoms.y);
dir.x = oldPos1.x-oldPos2.x; dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y; dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
...@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai ...@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
/** /**
* Compute the force applied by each constraint. * Compute the force applied by each constraint.
*/ */
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const float4* restrict constraintDistance, __global const float4* restrict atomPositions, __kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
__global const float* restrict reducedMass, __global float* restrict delta1, __global int* restrict converged, float tol, int iteration) { __global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
__local int groupConverged; __local int groupConverged;
if (converged[1-iteration%2]) { if (converged[1-iteration%2]) {
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
...@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto ...@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
groupConverged = 1; groupConverged = 1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
float lowerTol = 1.0f-2.0f*tol+tol*tol; mixed lowerTol = 1-2*tol+tol*tol;
float upperTol = 1.0f+2.0f*tol+tol*tol; mixed upperTol = 1+2*tol+tol*tol;
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
float4 dir = constraintDistance[index]; mixed4 dir = constraintDistance[index];
float4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y]; mixed4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
#ifndef CONSTRAIN_VELOCITIES #ifndef CONSTRAIN_VELOCITIES
rp_ij.xyz += dir.xyz; rp_ij.xyz += dir.xyz;
#endif #endif
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z; mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z; mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
#ifdef CONSTRAIN_VELOCITIES #ifdef CONSTRAIN_VELOCITIES
delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2; delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;
// See whether it has converged. // See whether it has converged.
...@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto ...@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
converged[iteration%2] = 0; converged[iteration%2] = 0;
} }
#else #else
float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z; mixed rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
float dist2 = dir.w*dir.w; mixed dist2 = dir.w*dir.w;
float diff = dist2 - rp2; mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f); delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
// See whether it has converged. // See whether it has converged.
...@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto ...@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
/** /**
* Multiply the vector of constraint forces by the constraint matrix. * Multiply the vector of constraint forces by the constraint matrix.
*/ */
__kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, __global float* restrict delta2, __global const int* restrict constraintMatrixColumn, __kernel void multiplyByConstraintMatrix(__global const mixed* restrict delta1, __global mixed* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const float* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) { __global const mixed* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
if (converged[iteration%2]) if (converged[iteration%2])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix. // Multiply by the inverse constraint matrix.
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
float sum = 0.0f; mixed sum = 0;
for (int i = 0; ; i++) { for (int i = 0; ; i++) {
int element = index+i*NUM_CONSTRAINTS; int element = index+i*NUM_CONSTRAINTS;
int column = constraintMatrixColumn[element]; int column = constraintMatrixColumn[element];
...@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, ...@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1,
/** /**
* Update the atom positions based on constraint forces. * Update the atom positions based on constraint forces.
*/ */
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const float4* restrict constraintDistance, __kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const mixed4* restrict constraintDistance,
__global float4* restrict atomPositions, __global const float4* restrict velm, __global const float* restrict delta1, __global const float* restrict delta2, __global int* restrict converged, int iteration) { __global mixed4* restrict atomPositions, __global const mixed4* restrict velm, __global const mixed* restrict delta1, __global const mixed* restrict delta2, __global int* restrict converged, int iteration) {
if (get_global_id(0) == 0) if (get_global_id(0) == 0)
converged[1-iteration%2] = 1; converged[1-iteration%2] = 1;
if (converged[iteration%2]) if (converged[iteration%2])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f); mixed damping = (iteration < 2 ? 0.5f : 1.0f);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
// Compute the new position of this atom. // Compute the new position of this atom.
float4 atomPos = atomPositions[index]; mixed4 atomPos = atomPositions[index];
float invMass = velm[index].w; mixed invMass = velm[index].w;
int num = numAtomConstraints[index]; int num = numAtomConstraints[index];
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
int constraint = atomConstraints[index+i*NUM_ATOMS]; int constraint = atomConstraints[index+i*NUM_ATOMS];
bool forward = (constraint > 0); bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1); constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*delta2[constraint]; mixed constraintForce = damping*invMass*delta2[constraint];
constraintForce = (forward ? constraintForce : -constraintForce); constraintForce = (forward ? constraintForce : -constraintForce);
float4 dir = constraintDistance[constraint]; mixed4 dir = constraintDistance[constraint];
atomPos.x += constraintForce*dir.x; atomPos.x += constraintForce*dir.x;
atomPos.y += constraintForce*dir.y; atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z; atomPos.z += constraintForce*dir.z;
......
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) { __kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index]; #ifdef USE_MIXED_PRECISION
position.xyz += posDelta[index].xyz; real4 pos1 = posq[index];
posq[index] = position; real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
mixed4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
} }
} }
__kernel void computeSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) { __kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, unsigned int outputIndex, int bufferSize) {
__local float tempBuffer[WORK_GROUP_SIZE]; __local float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0); const unsigned int thread = get_local_id(0);
float sum = 0.0f; float sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index]; sum += sumBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
...@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa ...@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa
result[outputIndex] = tempBuffer[0]; result[outputIndex] = tempBuffer[0];
} }
__kernel void applyPositionDeltas(__global float4* restrict posq, __global float4* restrict posDelta) { #ifdef SUPPORTS_DOUBLE_PRECISION
__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, unsigned int outputIndex, int bufferSize) {
__local double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
double sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
result[outputIndex] = tempBuffer[0];
}
#endif
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 position = posq[index]; #ifdef USE_MIXED_PRECISION
position.xyz += posDelta[index].xyz; real4 pos1 = posq[index];
posq[index] = position; real4 pos2 = posqCorrection[index];
posDelta[index] = (float4) 0.0f; mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
posDelta[index] = (mixed4) 0;
} }
} }
......
__kernel void computeGlobal(__global float2* restrict dt, __global float* restrict globals, __global float* restrict params, __kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
float uniform, float gaussian, __global const float* restrict energy) { float uniform, float gaussian, __global const real* restrict energy) {
COMPUTE_STEP COMPUTE_STEP
} }
#ifdef SUPPORTS_DOUBLE_PRECISION /**
#pragma OPENCL EXTENSION cl_khr_fp64 : enable * Load the position of a particle.
*/
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Store the position of a particle.
*/
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif #endif
}
__kernel void computePerDof(__global float4* restrict posq, __global float4* restrict posDelta, __global float4* restrict velm, __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global const float4* restrict force, __global const float2* restrict dt, __global const float* restrict globals, __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global const float* restrict params, __global float* restrict sum, __global const float4* restrict gaussianValues, __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
unsigned int randomIndex, __global const float4* restrict uniformValues, __global const float* restrict energy unsigned int randomIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
float stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = get_global_id(0); int index = get_global_id(0);
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
#ifdef SUPPORTS_DOUBLE_PRECISION
#ifdef LOAD_POS_AS_DELTA #ifdef LOAD_POS_AS_DELTA
double4 position = convert_double4(posq[index]+posDelta[index]); mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
#else #else
double4 position = convert_double4(posq[index]); mixed4 position = loadPos(posq, posqCorrection, index);
#endif
double4 velocity = convert_double4(velm[index]);
double4 f = convert_double4(force[index]);
double mass = 1.0/velocity.w;
#else
#ifdef LOAD_POS_AS_DELTA
float4 position = posq[index]+posDelta[index];
#else
float4 position = posq[index];
#endif
float4 velocity = velm[index];
float4 f = force[index];
float mass = 1.0f/velocity.w;
#endif #endif
mixed4 velocity = velm[index];
real4 f = force[index];
mixed mass = 1/velocity.w;
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
float4 gaussian = gaussianValues[randomIndex]; float4 gaussian = gaussianValues[randomIndex];
float4 uniform = uniformValues[index]; float4 uniform = uniformValues[index];
......
float2 multofFloat2(float2 a, float2 b) { float2 multofFloat2(float2 a, float2 b) {
return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
} }
......
#ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
enum {VelScale, ForceScale, NoiseScale, MaxParams}; enum {VelScale, ForceScale, NoiseScale, MaxParams};
/** /**
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta, __kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
__global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) { __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
float vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
float fscale = paramBuffer[ForceScale]; mixed fscale = paramBuffer[ForceScale];
float noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
float stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = get_global_id(0); int index = get_global_id(0);
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
float sqrtInvMass = sqrt(velocity.w); mixed sqrtInvMass = sqrt(velocity.w);
velocity.xyz = vscale*velocity.xyz + fscale*velocity.w*force[index].xyz + noisescale*sqrtInvMass*random[randomIndex].xyz; velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity; velm[index] = velocity;
posDelta[index] = stepSize*velocity; posDelta[index] = stepSize*velocity;
} }
...@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co ...@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) { __kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
#ifdef SUPPORTS_DOUBLE_PRECISION #ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
...@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co ...@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
#endif #endif
int index = get_global_id(0); int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 vel = velm[index]; mixed4 vel = velm[index];
if (vel.w != 0.0) { if (vel.w != 0.0) {
float4 pos = posq[index]; #ifdef USE_MIXED_PRECISION
float4 delta = posDelta[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz; pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION #ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz; vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else #else
vel.xyz = invStepSize*delta.xyz; vel.xyz = invStepSize*delta.xyz;
#endif #endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos; posq[index] = pos;
#endif
velm[index] = vel; velm[index] = vel;
} }
index += get_global_size(0); index += get_global_size(0);
...@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co ...@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt, __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, __global mixed2* restrict dt,
__global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) { __global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
// Calculate the error. // Calculate the error.
float err = 0.0f; mixed err = 0.0f;
unsigned int index = get_local_id(0); unsigned int index = get_local_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 f = force[index]; real4 f = force[index];
float invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += get_global_size(0); index += get_global_size(0);
} }
...@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta ...@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
if (get_global_id(0) == 0) { if (get_global_id(0) == 0) {
// Select the new step size. // Select the new step size.
float totalError = sqrt(error[0]/(NUM_ATOMS*3)); mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
float newStepSize = sqrt(errorTol/totalError); mixed newStepSize = sqrt(errorTol/totalError);
float oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase. newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize) if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
...@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta ...@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
// Recalculate the integration parameters. // Recalculate the integration parameters.
float vscale = exp(-newStepSize/tau); mixed vscale = exp(-newStepSize/tau);
float fscale = (1-vscale)*tau; mixed fscale = (1-vscale)*tau;
float noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau); mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale; params[VelScale] = vscale;
params[ForceScale] = fscale; params[ForceScale] = fscale;
params[NoiseScale] = noisescale; params[NoiseScale] = noisescale;
......
...@@ -2,13 +2,16 @@ ...@@ -2,13 +2,16 @@
* Calculate the center of mass momentum. * Calculate the center of mass momentum.
*/ */
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) { __kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
int index = get_global_id(0); int index = get_global_id(0);
float4 cm = 0.0f; float4 cm = 0.0f;
while (index < numAtoms) { while (index < numAtoms) {
float4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) if (velocity.w != 0) {
cm.xyz += velocity.xyz/velocity.w; cm.x += velocity.x/velocity.w;
cm.y += velocity.y/velocity.w;
cm.z += velocity.z/velocity.w;
}
index += get_global_size(0); index += get_global_size(0);
} }
...@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest ...@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
* Remove center of mass motion. * Remove center of mass motion.
*/ */
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) { __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups. // First sum all of the momenta that were calculated by individual groups.
unsigned int index = get_local_id(0); unsigned int index = get_local_id(0);
...@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4* ...@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4*
index = get_global_id(0); index = get_global_id(0);
while (index < numAtoms) { while (index < numAtoms) {
velm[index].xyz -= cm.xyz; velm[index].x -= cm.x;
velm[index].y -= cm.y;
velm[index].z -= cm.z;
index += get_global_size(0); index += get_global_size(0);
} }
} }
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/** /**
* Enforce constraints on SETTLE clusters * Enforce constraints on SETTLE clusters
*/ */
__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) { __kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0); int index = get_global_id(0);
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float2 params = clusterParams[index]; float2 params = clusterParams[index];
float4 apos0 = oldPos[atoms.x]; mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
float4 xp0 = posDelta[atoms.x]; mixed4 xp0 = posDelta[atoms.x];
float4 apos1 = oldPos[atoms.y]; mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
float4 xp1 = posDelta[atoms.y]; mixed4 xp1 = posDelta[atoms.y];
float4 apos2 = oldPos[atoms.z]; mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
float4 xp2 = posDelta[atoms.z]; mixed4 xp2 = posDelta[atoms.z];
float m0 = RECIP(velm[atoms.x].w); mixed m0 = 1/velm[atoms.x].w;
float m1 = RECIP(velm[atoms.y].w); mixed m1 = 1/velm[atoms.y].w;
float m2 = RECIP(velm[atoms.z].w); mixed m2 = 1/velm[atoms.z].w;
// Apply the SETTLE algorithm. // Apply the SETTLE algorithm.
float xb0 = apos1.x-apos0.x; mixed xb0 = apos1.x-apos0.x;
float yb0 = apos1.y-apos0.y; mixed yb0 = apos1.y-apos0.y;
float zb0 = apos1.z-apos0.z; mixed zb0 = apos1.z-apos0.z;
float xc0 = apos2.x-apos0.x; mixed xc0 = apos2.x-apos0.x;
float yc0 = apos2.y-apos0.y; mixed yc0 = apos2.y-apos0.y;
float zc0 = apos2.z-apos0.z; mixed zc0 = apos2.z-apos0.z;
float invTotalMass = 1.0f/(m0+m1+m2); mixed invTotalMass = 1.0f/(m0+m1+m2);
float xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass; mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
float ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass; mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
float zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass; mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
float xa1 = xp0.x - xcom; mixed xa1 = xp0.x - xcom;
float ya1 = xp0.y - ycom; mixed ya1 = xp0.y - ycom;
float za1 = xp0.z - zcom; mixed za1 = xp0.z - zcom;
float xb1 = xb0 + xp1.x - xcom; mixed xb1 = xb0 + xp1.x - xcom;
float yb1 = yb0 + xp1.y - ycom; mixed yb1 = yb0 + xp1.y - ycom;
float zb1 = zb0 + xp1.z - zcom; mixed zb1 = zb0 + xp1.z - zcom;
float xc1 = xc0 + xp2.x - xcom; mixed xc1 = xc0 + xp2.x - xcom;
float yc1 = yc0 + xp2.y - ycom; mixed yc1 = yc0 + xp2.y - ycom;
float zc1 = zc0 + xp2.z - zcom; mixed zc1 = zc0 + xp2.z - zcom;
float xaksZd = yb0*zc0 - zb0*yc0; mixed xaksZd = yb0*zc0 - zb0*yc0;
float yaksZd = zb0*xc0 - xb0*zc0; mixed yaksZd = zb0*xc0 - xb0*zc0;
float zaksZd = xb0*yc0 - yb0*xc0; mixed zaksZd = xb0*yc0 - yb0*xc0;
float xaksXd = ya1*zaksZd - za1*yaksZd; mixed xaksXd = ya1*zaksZd - za1*yaksZd;
float yaksXd = za1*xaksZd - xa1*zaksZd; mixed yaksXd = za1*xaksZd - xa1*zaksZd;
float zaksXd = xa1*yaksZd - ya1*xaksZd; mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd; mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd; mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd; mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
float axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd); mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
float aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd); mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
float azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd); mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
float trns11 = xaksXd / axlng; mixed trns11 = xaksXd / axlng;
float trns21 = yaksXd / axlng; mixed trns21 = yaksXd / axlng;
float trns31 = zaksXd / axlng; mixed trns31 = zaksXd / axlng;
float trns12 = xaksYd / aylng; mixed trns12 = xaksYd / aylng;
float trns22 = yaksYd / aylng; mixed trns22 = yaksYd / aylng;
float trns32 = zaksYd / aylng; mixed trns32 = zaksYd / aylng;
float trns13 = xaksZd / azlng; mixed trns13 = xaksZd / azlng;
float trns23 = yaksZd / azlng; mixed trns23 = yaksZd / azlng;
float trns33 = zaksZd / azlng; mixed trns33 = zaksZd / azlng;
float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0; mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0; mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0; mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0; mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
float za1d = trns13*xa1 + trns23*ya1 + trns33*za1; mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1; mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1; mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1; mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1; mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1; mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1; mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' --- // --- Step2 A2' ---
float rc = 0.5*params.y; float rc = 0.5*params.y;
float rb = sqrt(params.x*params.x-rc*rc); mixed rb = sqrt(params.x*params.x-rc*rc);
float ra = rb*(m1+m2)*invTotalMass; mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra; rb -= ra;
float sinphi = za1d / ra; mixed sinphi = za1d / ra;
float cosphi = sqrt(1.0f - sinphi*sinphi); mixed cosphi = sqrt(1.0f - sinphi*sinphi);
float sinpsi = (zb1d - zc1d) / (2*rc*cosphi); mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
float cospsi = sqrt(1.0f - sinpsi*sinpsi); mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
float ya2d = ra*cosphi; mixed ya2d = ra*cosphi;
float xb2d = - rc*cospsi; mixed xb2d = - rc*cospsi;
float yb2d = - rb*cosphi - rc*sinpsi*sinphi; mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
float yc2d = - rb*cosphi + rc*sinpsi*sinphi; mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
float xb2d2 = xb2d*xb2d; mixed xb2d2 = xb2d*xb2d;
float hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d); mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
float deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y); mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5; xb2d -= deltx*0.5;
// --- Step3 al,be,ga --- // --- Step3 al,be,ga ---
float alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d); mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
float beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d); mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
float gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d; mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
float al2be2 = alpha*alpha + beta*beta; mixed al2be2 = alpha*alpha + beta*beta;
float sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2; mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' --- // --- Step4 A3' ---
float costheta = sqrt(1.0f - sintheta*sintheta); mixed costheta = sqrt(1.0f - sintheta*sintheta);
float xa3d = - ya2d*sintheta; mixed xa3d = - ya2d*sintheta;
float ya3d = ya2d*costheta; mixed ya3d = ya2d*costheta;
float za3d = za1d; mixed za3d = za1d;
float xb3d = xb2d*costheta - yb2d*sintheta; mixed xb3d = xb2d*costheta - yb2d*sintheta;
float yb3d = xb2d*sintheta + yb2d*costheta; mixed yb3d = xb2d*sintheta + yb2d*costheta;
float zb3d = zb1d; mixed zb3d = zb1d;
float xc3d = - xb2d*costheta - yc2d*sintheta; mixed xc3d = - xb2d*costheta - yc2d*sintheta;
float yc3d = - xb2d*sintheta + yc2d*costheta; mixed yc3d = - xb2d*sintheta + yc2d*costheta;
float zc3d = zc1d; mixed zc3d = zc1d;
// --- Step5 A3 --- // --- Step5 A3 ---
float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d; mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d; mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d; mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d; mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d; mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d; mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d; mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d; mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d; mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3; xp0.x = xcom + xa3;
xp0.y = ycom + ya3; xp0.y = ycom + ya3;
...@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res ...@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res
* Enforce velocity constraints on SETTLE clusters * Enforce velocity constraints on SETTLE clusters
*/ */
__kernel void constrainVelocities(int numClusters, float tol, __global const float4* restrict oldPos, __global float4* restrict posDelta, __global float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) { __kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) { for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float4 apos0 = oldPos[atoms.x]; mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
float4 apos1 = oldPos[atoms.y]; mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
float4 apos2 = oldPos[atoms.z]; mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
float4 v0 = velm[atoms.x]; mixed4 v0 = velm[atoms.x];
float4 v1 = velm[atoms.y]; mixed4 v1 = velm[atoms.y];
float4 v2 = velm[atoms.z]; mixed4 v2 = velm[atoms.z];
// Compute intermediate quantities: the atom masses, the bond directions, the relative velocities, // Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
// and the angle cosines and sines. // and the angle cosines and sines.
float mA = RECIP(v0.w); mixed mA = 1/v0.w;
float mB = RECIP(v1.w); mixed mB = 1/v1.w;
float mC = RECIP(v2.w); mixed mC = 1/v2.w;
float4 eAB = apos1-apos0; mixed4 eAB = apos1-apos0;
float4 eBC = apos2-apos1; mixed4 eBC = apos2-apos1;
float4 eCA = apos0-apos2; mixed4 eCA = apos0-apos2;
eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z); eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z); eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z); eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
float vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z; mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
float vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z; mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
float vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z; mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
float cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z); mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
float cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z); mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
float cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z); mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
float s2A = 1-cA*cA; mixed s2A = 1-cA*cA;
float s2B = 1-cB*cB; mixed s2B = 1-cB*cB;
float s2C = 1-cC*cC; mixed s2C = 1-cC*cC;
// Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because // Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
// in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're // in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
// making that assumption). We allow all three atoms to have different masses. // making that assumption). We allow all three atoms to have different masses.
float mABCinv = RECIP(mA*mB*mC); mixed mABCinv = 1/(mA*mB*mC);
float denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv; mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
float tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom; mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
float tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom; mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
float tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom; mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w; v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w; v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w; v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment