Commit 9ad85ebd authored by Peter Eastman's avatar Peter Eastman
Browse files

Began implementing new mixed precision model that does integration in double...

Began implementing new mixed precision model that does integration in double precision and force evaluation in single precision
parent 7492dc48
...@@ -67,22 +67,22 @@ bool CudaContext::hasInitializedCuda = false; ...@@ -67,22 +67,22 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler), const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
if (!hasInitializedCuda) { if (!hasInitializedCuda) {
CHECK_RESULT2(cuInit(0), "Error initializing CUDA"); CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
hasInitializedCuda = true; hasInitializedCuda = true;
} }
if (precision == "single") { if (precision == "single") {
useDoublePrecision = false; useDoublePrecision = false;
accumulateInDouble = false; useMixedPrecision = false;
} }
else if (precision == "mixed") { else if (precision == "mixed") {
useDoublePrecision = false; useDoublePrecision = false;
accumulateInDouble = true; useMixedPrecision = true;
} }
else if (precision == "double") { else if (precision == "double") {
useDoublePrecision = true; useDoublePrecision = true;
accumulateInDouble = true; useMixedPrecision = false;
} }
else else
throw OpenMMException("Illegal value for CudaPrecision: "+precision); throw OpenMMException("Illegal value for CudaPrecision: "+precision);
...@@ -150,16 +150,37 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -150,16 +150,37 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["make_real2"] = "make_double2"; compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3"; compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4"; compilationDefines["make_real4"] = "make_double4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) {
posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
posqCorrection = CudaArray::create<float4>(*this, paddedNumAtoms, "posqCorrection");
velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else { else {
posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq"); posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(*this, paddedNumAtoms, "velm"); velm = CudaArray::create<float4>(*this, paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_float2"; compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3"; compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4"; compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_float2";
compilationDefines["make_mixed3"] = "make_float3";
compilationDefines["make_mixed4"] = "make_float4";
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
...@@ -211,6 +232,8 @@ CudaContext::~CudaContext() { ...@@ -211,6 +232,8 @@ CudaContext::~CudaContext() {
cuMemFreeHost(pinnedBuffer); cuMemFreeHost(pinnedBuffer);
if (posq != NULL) if (posq != NULL)
delete posq; delete posq;
if (posqCorrection != NULL)
delete posqCorrection;
if (velm != NULL) if (velm != NULL)
delete velm; delete velm;
if (force != NULL) if (force != NULL)
...@@ -237,7 +260,7 @@ void CudaContext::initialize() { ...@@ -237,7 +260,7 @@ void CudaContext::initialize() {
string errorMessage = "Error initializing Context"; string errorMessage = "Error initializing Context";
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i); double mass = system.getParticleMass(i);
if (useDoublePrecision) if (useDoublePrecision || useMixedPrecision)
((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass); ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else else
((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass)); ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
...@@ -308,6 +331,18 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -308,6 +331,18 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << "typedef float3 real3;\n"; src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n"; src << "typedef float4 real4;\n";
} }
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double3 mixed3;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) { for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first; src << "#define " << iter->first;
if (!iter->second.empty()) if (!iter->second.empty())
...@@ -789,6 +824,22 @@ void CudaContext::validateMolecules() { ...@@ -789,6 +824,22 @@ void CudaContext::validateMolecules() {
posq->upload(newPosq); posq->upload(newPosq);
velm->upload(newVelm); velm->upload(newVelm);
} }
else if (useMixedPrecision) {
vector<float4> oldPosq(paddedNumAtoms);
vector<float4> newPosq(paddedNumAtoms);
vector<double4> oldVelm(paddedNumAtoms);
vector<double4> newVelm(paddedNumAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
else { else {
vector<float4> oldPosq(paddedNumAtoms); vector<float4> oldPosq(paddedNumAtoms);
vector<float4> newPosq(paddedNumAtoms); vector<float4> newPosq(paddedNumAtoms);
...@@ -822,19 +873,24 @@ void CudaContext::reorderAtoms(bool enforcePeriodic) { ...@@ -822,19 +873,24 @@ void CudaContext::reorderAtoms(bool enforcePeriodic) {
validateMolecules(); validateMolecules();
atomsWereReordered = true; atomsWereReordered = true;
if (useDoublePrecision) if (useDoublePrecision)
reorderAtomsImpl<double, double4>(enforcePeriodic); reorderAtomsImpl<double, double4, double, double4>(enforcePeriodic);
else if (useMixedPrecision)
reorderAtomsImpl<float, float4, double, double4>(enforcePeriodic);
else else
reorderAtomsImpl<float, float4>(enforcePeriodic); reorderAtomsImpl<float, float4, float, float4>(enforcePeriodic);
} }
template <class Real, class Real4> template <class Real, class Real4, class Mixed, class Mixed4>
void CudaContext::reorderAtomsImpl(bool enforcePeriodic) { void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
// Find the range of positions and the number of bins along each axis. // Find the range of positions and the number of bins along each axis.
vector<Real4> oldPosq(paddedNumAtoms); vector<Real4> oldPosq(paddedNumAtoms);
vector<Real4> oldVelm(paddedNumAtoms); vector<Real4> oldPosqCorrection(paddedNumAtoms);
vector<Mixed4> oldVelm(paddedNumAtoms);
posq->download(oldPosq); posq->download(oldPosq);
velm->download(oldVelm); velm->download(oldVelm);
if (useMixedPrecision)
posqCorrection->download(oldPosqCorrection);
Real minx = oldPosq[0].x, maxx = oldPosq[0].x; Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y; Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z; Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
...@@ -860,7 +916,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) { ...@@ -860,7 +916,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
vector<int> originalIndex(numAtoms); vector<int> originalIndex(numAtoms);
vector<Real4> newPosq(paddedNumAtoms); vector<Real4> newPosq(paddedNumAtoms);
vector<Real4> newVelm(paddedNumAtoms); vector<Real4> newPosqCorrection(paddedNumAtoms);
vector<Mixed4> newVelm(paddedNumAtoms);
vector<int4> newCellOffsets(numAtoms); vector<int4> newCellOffsets(numAtoms);
for (int group = 0; group < (int) moleculeGroups.size(); group++) { for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// Find the center of each molecule. // Find the center of each molecule.
...@@ -959,6 +1016,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) { ...@@ -959,6 +1016,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
int newIndex = mol.offsets[i]+atoms[j]; int newIndex = mol.offsets[i]+atoms[j];
originalIndex[newIndex] = atomIndex[oldIndex]; originalIndex[newIndex] = atomIndex[oldIndex];
newPosq[newIndex] = oldPosq[oldIndex]; newPosq[newIndex] = oldPosq[oldIndex];
if (useMixedPrecision)
newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
newVelm[newIndex] = oldVelm[oldIndex]; newVelm[newIndex] = oldVelm[oldIndex];
newCellOffsets[newIndex] = posCellOffsets[oldIndex]; newCellOffsets[newIndex] = posCellOffsets[oldIndex];
} }
...@@ -972,6 +1031,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) { ...@@ -972,6 +1031,8 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
} }
posq->upload(newPosq); posq->upload(newPosq);
if (useMixedPrecision)
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm); velm->upload(newVelm);
atomIndexDevice->upload(atomIndex); atomIndexDevice->upload(atomIndex);
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
......
...@@ -135,6 +135,12 @@ public: ...@@ -135,6 +135,12 @@ public:
CudaArray& getPosq() { CudaArray& getPosq() {
return *posq; return *posq;
} }
/**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/
CudaArray& getPosqCorrection() {
return *posqCorrection;
}
/** /**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom. * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/ */
...@@ -314,10 +320,10 @@ public: ...@@ -314,10 +320,10 @@ public:
return useDoublePrecision; return useDoublePrecision;
} }
/** /**
* Get whether accumulation is being done in double precision. * Get whether mixed precision is being used.
*/ */
bool getAccumulateInDouble() { bool getUseMixedPrecision() {
return accumulateInDouble; return useMixedPrecision;
} }
/** /**
* Convert a number to a string in a format suitable for including in a kernel. * Convert a number to a string in a format suitable for including in a kernel.
...@@ -455,7 +461,7 @@ private: ...@@ -455,7 +461,7 @@ private:
/** /**
* This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use. * This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
*/ */
template <class Real, class Real4> template <class Real, class Real4, class Mixed, class Mixed4>
void reorderAtomsImpl(bool enforcePeriodic); void reorderAtomsImpl(bool enforcePeriodic);
static bool hasInitializedCuda; static bool hasInitializedCuda;
const System& system; const System& system;
...@@ -469,7 +475,7 @@ private: ...@@ -469,7 +475,7 @@ private:
int paddedNumAtoms; int paddedNumAtoms;
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid; bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture; std::string compiler, tempDir, gpuArchitecture;
float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat; float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 periodicBoxSize, invPeriodicBoxSize; double4 periodicBoxSize, invPeriodicBoxSize;
...@@ -489,6 +495,7 @@ private: ...@@ -489,6 +495,7 @@ private:
std::vector<int4> posCellOffsets; std::vector<int4> posCellOffsets;
void* pinnedBuffer; void* pinnedBuffer;
CudaArray* posq; CudaArray* posq;
CudaArray* posqCorrection;
CudaArray* velm; CudaArray* velm;
CudaArray* force; CudaArray* force;
CudaArray* energyBuffer; CudaArray* energyBuffer;
......
...@@ -104,7 +104,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -104,7 +104,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
// Create workspace arrays. // Create workspace arrays.
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta"); posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0)); vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas); posDelta->upload(deltas);
...@@ -473,7 +473,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -473,7 +473,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize()); vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize()); vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize()); vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
ccmaDistance = CudaArray::create<double4>(context, numCCMA, "CcmaDistance"); ccmaDistance = CudaArray::create<double4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = CudaArray::create<double>(context, numCCMA, "CcmaDelta1"); ccmaDelta1 = CudaArray::create<double>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = CudaArray::create<double>(context, numCCMA, "CcmaDelta2"); ccmaDelta2 = CudaArray::create<double>(context, numCCMA, "CcmaDelta2");
...@@ -717,23 +717,24 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double ...@@ -717,23 +717,24 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
ccmaForceKernel = ccmaPosForceKernel; ccmaForceKernel = ccmaPosForceKernel;
} }
float floatTol = (float) tol; float floatTol = (float) tol;
void* tolPointer = (context.getUseDoublePrecision() ? (void*) &tol : (void*) &floatTol); void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol);
CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
if (settleAtoms != NULL) { if (settleAtoms != NULL) {
int numClusters = settleAtoms->getSize(); int numClusters = settleAtoms->getSize();
void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
&posDelta->getDevicePointer(), &context.getVelm().getDevicePointer(), &posDelta->getDevicePointer(), &context.getVelm().getDevicePointer(),
&settleAtoms->getDevicePointer(), &settleParams->getDevicePointer()}; &settleAtoms->getDevicePointer(), &settleParams->getDevicePointer()};
context.executeKernel(settleKernel, args, settleAtoms->getSize()); context.executeKernel(settleKernel, args, settleAtoms->getSize());
} }
if (shakeAtoms != NULL) { if (shakeAtoms != NULL) {
int numClusters = shakeAtoms->getSize(); int numClusters = shakeAtoms->getSize();
void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&shakeAtoms->getDevicePointer(), &shakeParams->getDevicePointer()}; &shakeAtoms->getDevicePointer(), &shakeParams->getDevicePointer()};
context.executeKernel(shakeKernel, args, shakeAtoms->getSize()); context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
} }
if (ccmaAtoms != NULL) { if (ccmaAtoms != NULL) {
void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer()}; void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection};
context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
int i; int i;
void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
...@@ -768,7 +769,8 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double ...@@ -768,7 +769,8 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
void CudaIntegrationUtilities::computeVirtualSites() { void CudaIntegrationUtilities::computeVirtualSites() {
if (numVsites > 0) { if (numVsites > 0) {
void* args[] = {&context.getPosq().getDevicePointer(), &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer()}; &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer()};
context.executeKernel(vsitePositionKernel, args, numVsites); context.executeKernel(vsitePositionKernel, args, numVsites);
...@@ -777,7 +779,8 @@ void CudaIntegrationUtilities::computeVirtualSites() { ...@@ -777,7 +779,8 @@ void CudaIntegrationUtilities::computeVirtualSites() {
void CudaIntegrationUtilities::distributeForcesFromVirtualSites() { void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) { if (numVsites > 0) {
void* args[] = {&context.getPosq().getDevicePointer(), &context.getForce().getDevicePointer(), CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &context.getForce().getDevicePointer(),
&vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer()}; &vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer()};
......
...@@ -148,6 +148,18 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& ...@@ -148,6 +148,18 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z); positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
} }
} }
else if (cu.getUseMixedPrecision()) {
float4* posq = (float4*) cu.getPinnedBuffer();
vector<float4> posCorrection;
cu.getPosq().download(posq);
cu.getPosqCorrection().download(posCorrection);
for (int i = 0; i < numParticles; ++i) {
float4 pos1 = posq[i];
float4 pos2 = posCorrection[i];
int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x-offset.x*periodicBoxSize.x, (double)pos1.y+(double)pos2.y-offset.y*periodicBoxSize.y, (double)pos1.z+(double)pos2.z-offset.z*periodicBoxSize.z);
}
}
else { else {
float4* posq = (float4*) cu.getPinnedBuffer(); float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq); cu.getPosq().download(posq);
...@@ -183,14 +195,28 @@ void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector< ...@@ -183,14 +195,28 @@ void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<
for (int i = 0; i < numParticles; ++i) { for (int i = 0; i < numParticles; ++i) {
float4& pos = posq[i]; float4& pos = posq[i];
const Vec3& p = positions[order[i]]; const Vec3& p = positions[order[i]];
pos.x = p[0]; pos.x = (float) p[0];
pos.y = p[1]; pos.y = (float) p[1];
pos.z = p[2]; pos.z = (float) p[2];
} }
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++) for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
posq[i] = make_float4(0.0, 0.0, 0.0, 0.0); posq[i] = make_float4(0.0, 0.0, 0.0, 0.0);
cu.getPosq().upload(posq); cu.getPosq().upload(posq);
} }
if (cu.getUseMixedPrecision()) {
float4* posCorrection = (float4*) cu.getPinnedBuffer();
for (int i = 0; i < numParticles; ++i) {
float4& c = posCorrection[i];
const Vec3& p = positions[order[i]];
c.x = (float) (p[0]-(float)p[0]);
c.y = (float) (p[1]-(float)p[1]);
c.z = (float) (p[2]-(float)p[2]);
c.w = 0;
}
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
posCorrection[i] = make_float4(0.0, 0.0, 0.0, 0.0);
cu.getPosqCorrection().upload(posCorrection);
}
for (int i = 0; i < (int) cu.getPosCellOffsets().size(); i++) for (int i = 0; i < (int) cu.getPosCellOffsets().size(); i++)
cu.getPosCellOffsets()[i] = make_int4(0, 0, 0, 0); cu.getPosCellOffsets()[i] = make_int4(0, 0, 0, 0);
} }
...@@ -200,7 +226,7 @@ void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3> ...@@ -200,7 +226,7 @@ void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
velocities.resize(numParticles); velocities.resize(numParticles);
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double4* velm = (double4*) cu.getPinnedBuffer(); double4* velm = (double4*) cu.getPinnedBuffer();
cu.getVelm().download(velm); cu.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) { for (int i = 0; i < numParticles; ++i) {
...@@ -224,7 +250,7 @@ void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector ...@@ -224,7 +250,7 @@ void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector
cu.setAsCurrent(); cu.setAsCurrent();
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double4* velm = (double4*) cu.getPinnedBuffer(); double4* velm = (double4*) cu.getPinnedBuffer();
cu.getVelm().download(velm); cu.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) { for (int i = 0; i < numParticles; ++i) {
...@@ -290,12 +316,11 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& ...@@ -290,12 +316,11 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
stream.write((char*) &stepCount, sizeof(int)); stream.write((char*) &stepCount, sizeof(int));
int computeForceCount = cu.getComputeForceCount(); int computeForceCount = cu.getComputeForceCount();
stream.write((char*) &computeForceCount, sizeof(int)); stream.write((char*) &computeForceCount, sizeof(int));
int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
char* buffer = (char*) cu.getPinnedBuffer(); char* buffer = (char*) cu.getPinnedBuffer();
cu.getPosq().download(buffer); cu.getPosq().download(buffer);
stream.write(buffer, bufferSize); stream.write(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
cu.getVelm().download(buffer); cu.getVelm().download(buffer);
stream.write(buffer, bufferSize); stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size()); stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size()); stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
double4 box = cu.getPeriodicBoxSize(); double4 box = cu.getPeriodicBoxSize();
...@@ -321,11 +346,10 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st ...@@ -321,11 +346,10 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
contexts[i]->setStepCount(stepCount); contexts[i]->setStepCount(stepCount);
contexts[i]->setComputeForceCount(computeForceCount); contexts[i]->setComputeForceCount(computeForceCount);
} }
int bufferSize = cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
char* buffer = (char*) cu.getPinnedBuffer(); char* buffer = (char*) cu.getPinnedBuffer();
stream.read(buffer, bufferSize); stream.read(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
cu.getPosq().upload(buffer); cu.getPosq().upload(buffer);
stream.read(buffer, bufferSize); stream.read(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
cu.getVelm().upload(buffer); cu.getVelm().upload(buffer);
stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size()); stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
cu.getAtomIndexArray().upload(cu.getAtomIndex()); cu.getAtomIndexArray().upload(cu.getAtomIndex());
...@@ -2016,7 +2040,7 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor ...@@ -2016,7 +2040,7 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize()); defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
map<string, string> replacements; map<string, string> replacements;
stringstream defineAccum; stringstream defineAccum;
if (cu.getAccumulateInDouble()) { if (cu.getUseMixedPrecision()) {
defineAccum << "typedef double accum;\n"; defineAccum << "typedef double accum;\n";
defineAccum << "typedef double4 accum4;\n"; defineAccum << "typedef double4 accum4;\n";
defines["make_accum4"] = "make_double4"; defines["make_accum4"] = "make_double4";
...@@ -2531,7 +2555,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2531,7 +2555,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str(); replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
map<string, string> defines; map<string, string> defines;
stringstream defineAccum; stringstream defineAccum;
if (cu.getAccumulateInDouble()) { if (cu.getUseMixedPrecision()) {
defineAccum << "typedef double accum;\n"; defineAccum << "typedef double accum;\n";
defineAccum << "typedef double3 accum3;\n"; defineAccum << "typedef double3 accum3;\n";
defines["make_accum3"] = "make_double3"; defines["make_accum3"] = "make_double3";
...@@ -3971,7 +3995,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -3971,7 +3995,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
if (dt != prevStepSize) { if (dt != prevStepSize) {
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
vector<double2> stepSizeVec(1); vector<double2> stepSizeVec(1);
stepSizeVec[0] = make_double2(dt, dt); stepSizeVec[0] = make_double2(dt, dt);
cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec); cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
...@@ -3986,7 +4010,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -3986,7 +4010,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// Call the first integration kernel. // Call the first integration kernel.
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -3996,7 +4021,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -3996,7 +4021,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4023,7 +4048,7 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan ...@@ -4023,7 +4048,7 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
kernel1 = cu.getKernel(module, "integrateLangevinPart1"); kernel1 = cu.getKernel(module, "integrateLangevinPart1");
kernel2 = cu.getKernel(module, "integrateLangevinPart2"); kernel2 = cu.getKernel(module, "integrateLangevinPart2");
params = new CudaArray(cu, 3, cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "langevinParams"); params = new CudaArray(cu, 3, cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float), "langevinParams");
prevStepSize = -1.0; prevStepSize = -1.0;
} }
...@@ -4042,7 +4067,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4042,7 +4067,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
double vscale = exp(-stepSize/tau); double vscale = exp(-stepSize/tau);
double fscale = (1-vscale)*tau; double fscale = (1-vscale)*tau;
double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau); double noisescale = sqrt(2*kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
vector<double> p(params->getSize()); vector<double> p(params->getSize());
p[0] = vscale; p[0] = vscale;
p[1] = fscale; p[1] = fscale;
...@@ -4078,7 +4103,8 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4078,7 +4103,8 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4172,16 +4198,18 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4172,16 +4198,18 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
float maxStepSizeFloat = (float) maxStepSize; float maxStepSizeFloat = (float) maxStepSize;
double tol = integrator.getErrorTolerance(); double tol = integrator.getErrorTolerance();
float tolFloat = (float) tol; float tolFloat = (float) tol;
void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat, bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat, void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
useDouble ? (void*) &tol : (void*) &tolFloat,
&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()};
int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int sharedSize = blockSize*(useDouble ? sizeof(double) : sizeof(float));
cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize); cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);
// Call the first integration kernel. // Call the first integration kernel.
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4191,7 +4219,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4191,7 +4219,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4199,7 +4227,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4199,7 +4227,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Update the time and step count. // Update the time and step count.
double dt, time; double dt, time;
if (cu.getUseDoublePrecision()) { if (useDouble) {
double2 stepSize; double2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize); cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y; dt = stepSize.y;
...@@ -4237,7 +4265,7 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c ...@@ -4237,7 +4265,7 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
kernel1 = cu.getKernel(module, "integrateLangevinPart1"); kernel1 = cu.getKernel(module, "integrateLangevinPart1");
kernel2 = cu.getKernel(module, "integrateLangevinPart2"); kernel2 = cu.getKernel(module, "integrateLangevinPart2");
selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize"); selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
params = CudaArray::create<float>(cu, 3, "langevinParams"); params = new CudaArray(cu, 3, cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float), "langevinParams");
blockSize = min(256, system.getNumParticles()); blockSize = min(256, system.getNumParticles());
blockSize = max(blockSize, params->getSize()); blockSize = max(blockSize, params->getSize());
} }
...@@ -4257,13 +4285,14 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4257,13 +4285,14 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
float tauFloat = (float) tau; float tauFloat = (float) tau;
double kT = BOLTZ*integrator.getTemperature(); double kT = BOLTZ*integrator.getTemperature();
float kTFloat = (float) kT; float kTFloat = (float) kT;
void* argsSelect[] = {cu.getUseDoublePrecision() ? (void*) &maxStepSize : (void*) &maxStepSizeFloat, bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
cu.getUseDoublePrecision() ? (void*) &tol : (void*) &tolFloat, void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
cu.getUseDoublePrecision() ? (void*) &tau : (void*) &tauFloat, useDouble ? (void*) &tol : (void*) &tolFloat,
cu.getUseDoublePrecision() ? (void*) &kT : (void*) &kTFloat, useDouble ? (void*) &tau : (void*) &tauFloat,
useDouble ? (void*) &kT : (void*) &kTFloat,
&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &params->getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &params->getDevicePointer()};
int sharedSize = blockSize*(cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int sharedSize = blockSize*(useDouble ? sizeof(double) : sizeof(float));
cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize); cu.executeKernel(selectSizeKernel, argsSelect, blockSize, blockSize, sharedSize);
// Call the first integration kernel. // Call the first integration kernel.
...@@ -4279,7 +4308,8 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4279,7 +4308,8 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getPosq().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4287,7 +4317,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4287,7 +4317,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Update the time and step count. // Update the time and step count.
double dt, time; double dt, time;
if (cu.getUseDoublePrecision()) { if (useDouble) {
double2 stepSize; double2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize); cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y; dt = stepSize.y;
...@@ -5099,7 +5129,7 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) { ...@@ -5099,7 +5129,7 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
double energy = 0.0; double energy = 0.0;
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double4* velm = (double4*) cu.getPinnedBuffer(); double4* velm = (double4*) cu.getPinnedBuffer();
cu.getVelm().download(velm); cu.getVelm().download(velm);
for (size_t i = 0; i < masses.size(); ++i) { for (size_t i = 0; i < masses.size(); ++i) {
......
...@@ -446,12 +446,12 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -446,12 +446,12 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision() && !context.getAccumulateInDouble()) if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision() && !context.getUseMixedPrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1"; defines["PARAMETER_SIZE_IS_EVEN"] = "1";
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
defines["ENABLE_SHUFFLE"] = "1"; defines["ENABLE_SHUFFLE"] = "1";
stringstream defineAccum; stringstream defineAccum;
if (context.getAccumulateInDouble()) { if (context.getUseMixedPrecision()) {
defineAccum << "typedef double accum;\n"; defineAccum << "typedef double accum;\n";
defineAccum << "typedef double3 accum3;\n"; defineAccum << "typedef double3 accum3;\n";
defines["make_accum3"] = "make_double3"; defines["make_accum3"] = "make_double3";
......
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
* Apply the Andersen thermostat to adjust particle velocities. * Apply the Andersen thermostat to adjust particle velocities.
*/ */
extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, real4* velm, const real2* __restrict__ stepSize, const float4* __restrict__ random, extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
unsigned int randomIndex, const int* __restrict__ atomGroups) { unsigned int randomIndex, const int* __restrict__ atomGroups) {
float collisionProbability = 1.0f-expf(-collisionFrequency*stepSize[0].y); float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
float randomRange = erff(collisionProbability/sqrtf(2.0f)); float randomRange = erff(collisionProbability/sqrtf(2.0f));
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 velocity = velm[index]; mixed4 velocity = velm[index];
float4 selectRand = random[randomIndex+atomGroups[index]]; float4 selectRand = random[randomIndex+atomGroups[index]];
float4 velRand = random[randomIndex+index]; float4 velRand = random[randomIndex+index];
real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1); real scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0 : 1);
......
...@@ -4,23 +4,23 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams}; ...@@ -4,23 +4,23 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart1(real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta, extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
const real* __restrict__ paramBuffer, const real2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) { const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
real vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
real fscale = paramBuffer[ForceScale]/(real) 0xFFFFFFFF; mixed fscale = paramBuffer[ForceScale]/(mixed) 0xFFFFFFFF;
real noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
real stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
real4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0) { if (velocity.w != 0) {
real sqrtInvMass = SQRT(velocity.w); mixed sqrtInvMass = SQRT(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x; velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y; velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z; velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity; velm[index] = velocity;
posDelta[index] = make_real4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0); posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
} }
randomIndex += blockDim.x*gridDim.x; randomIndex += blockDim.x*gridDim.x;
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
...@@ -31,21 +31,32 @@ extern "C" __global__ void integrateLangevinPart1(real4* __restrict__ velm, cons ...@@ -31,21 +31,32 @@ extern "C" __global__ void integrateLangevinPart1(real4* __restrict__ velm, cons
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, const real4* __restrict__ posDelta, real4* __restrict__ velm, const real2* __restrict__ dt) { extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
real4 vel = velm[index]; mixed4 vel = velm[index];
if (vel.w != 0) { if (vel.w != 0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index]; real4 pos = posq[index];
real4 delta = posDelta[index]; #endif
mixed4 delta = posDelta[index];
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
vel.x = (real) invStepSize*delta.x; vel.x = (mixed) (invStepSize*delta.x);
vel.y = (real) invStepSize*delta.y; vel.y = (mixed) (invStepSize*delta.y);
vel.z = (real) invStepSize*delta.z; vel.z = (mixed) (invStepSize*delta.z);
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos; posq[index] = pos;
#endif
velm[index] = vel; velm[index] = vel;
} }
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
...@@ -56,18 +67,18 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, cons ...@@ -56,18 +67,18 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, cons
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTol, real tau, real kT, real2* __restrict__ dt, extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
const real4* __restrict__ velm, const long long* __restrict__ force, real* __restrict__ paramBuffer) { const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
// Calculate the error. // Calculate the error.
extern __shared__ real params[]; extern __shared__ mixed params[];
real* error = &params[MaxParams]; mixed* error = &params[MaxParams];
real err = 0; mixed err = 0;
unsigned int index = threadIdx.x; unsigned int index = threadIdx.x;
const real scale = RECIP((real) 0xFFFFFFFF); const mixed scale = RECIP((mixed) 0xFFFFFFFF);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
real3 f = make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
real invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
} }
...@@ -84,9 +95,9 @@ extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTo ...@@ -84,9 +95,9 @@ extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTo
if (blockIdx.x*blockDim.x+threadIdx.x == 0) { if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
// Select the new step size. // Select the new step size.
real totalError = sqrt(error[0]/(NUM_ATOMS*3)); mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
real newStepSize = sqrt(errorTol/totalError); mixed newStepSize = sqrt(errorTol/totalError);
real oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase. newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize) if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
...@@ -97,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTo ...@@ -97,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(real maxStepSize, real errorTo
// Recalculate the integration parameters. // Recalculate the integration parameters.
real vscale = exp(-newStepSize/tau); mixed vscale = exp(-newStepSize/tau);
real fscale = (1-vscale)*tau; mixed fscale = (1-vscale)*tau;
real noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau); mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale; params[VelScale] = vscale;
params[ForceScale] = fscale; params[ForceScale] = fscale;
params[NoiseScale] = noisescale; params[NoiseScale] = noisescale;
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
* Calculate the center of mass momentum. * Calculate the center of mass momentum.
*/ */
extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const real4* __restrict__ velm, float4* __restrict__ cmMomentum) { extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const mixed4* __restrict__ velm, float4* __restrict__ cmMomentum) {
extern __shared__ volatile float3 temp[]; extern __shared__ volatile float3 temp[];
float3 cm = make_float3(0, 0, 0); float3 cm = make_float3(0, 0, 0);
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
real4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0) {
real mass = RECIP(velocity.w); mixed mass = RECIP(velocity.w);
cm.x += (float) velocity.x*mass; cm.x += (float) velocity.x*mass;
cm.y += (float) velocity.y*mass; cm.y += (float) velocity.y*mass;
cm.z += (float) velocity.z*mass; cm.z += (float) velocity.z*mass;
...@@ -57,7 +57,7 @@ extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const real4* _ ...@@ -57,7 +57,7 @@ extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const real4* _
* Remove center of mass motion. * Remove center of mass motion.
*/ */
extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, real4* __restrict__ velm, const float4* __restrict__ cmMomentum) { extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, mixed4* __restrict__ velm, const float4* __restrict__ cmMomentum) {
// First sum all of the momenta that were calculated by individual groups. // First sum all of the momenta that were calculated by individual groups.
extern volatile float3 temp[]; extern volatile float3 temp[];
...@@ -104,7 +104,7 @@ extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, rea ...@@ -104,7 +104,7 @@ extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, rea
// Now remove the center of mass velocity from each atom. // Now remove the center of mass velocity from each atom.
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
real4 velocity = velm[index]; mixed4 velocity = velm[index];
velocity.x -= cm.x; velocity.x -= cm.x;
velocity.y -= cm.y; velocity.y -= cm.y;
velocity.z -= cm.z; velocity.z -= cm.z;
......
...@@ -2,15 +2,22 @@ ...@@ -2,15 +2,22 @@
* Perform the first step of Verlet integration. * Perform the first step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, const real4* __restrict__ posq, real4* __restrict__ velm, const long long* __restrict__ force, real4* __restrict__ posDelta) { extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq,
const real2 stepSize = dt[0]; const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
const real dtPos = stepSize.y; const mixed2 stepSize = dt[0];
const real dtVel = 0.5f*(stepSize.x+stepSize.y); const mixed dtPos = stepSize.y;
const real scale = dtVel/(real) 0xFFFFFFFF; const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = dtVel/(mixed) 0xFFFFFFFF;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index]; real4 pos = posq[index];
#endif
velocity.x += scale*force[index]*velocity.w; velocity.x += scale*force[index]*velocity.w;
velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w; velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w;
velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w; velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w;
...@@ -27,22 +34,34 @@ extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, co ...@@ -27,22 +34,34 @@ extern "C" __global__ void integrateVerletPart1(const real2* __restrict__ dt, co
* Perform the second step of Verlet integration. * Perform the second step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* __restrict__ posq, real4* __restrict__ velm, const real4* __restrict__ posDelta) { extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
real2 stepSize = dt[0]; real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
mixed2 stepSize = dt[0];
double oneOverDt = 1.0/stepSize.y; double oneOverDt = 1.0/stepSize.y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0) if (index == 0)
dt[0].x = stepSize.y; dt[0].x = stepSize.y;
for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index]; real4 pos = posq[index];
real4 delta = posDelta[index]; #endif
mixed4 delta = posDelta[index];
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
velocity = make_real4((real) (delta.x*oneOverDt), (real) (delta.y*oneOverDt), (real) (delta.z*oneOverDt), velocity.w); velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos; posq[index] = pos;
#endif
velm[index] = velocity; velm[index] = velocity;
} }
} }
...@@ -52,15 +71,15 @@ extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* _ ...@@ -52,15 +71,15 @@ extern "C" __global__ void integrateVerletPart2(real2* __restrict__ dt, real4* _
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, real2* __restrict__ dt, const real4* __restrict__ velm, const long long* __restrict__ force) { extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
// Calculate the error. // Calculate the error.
extern __shared__ real error[]; extern __shared__ mixed error[];
real err = 0.0f; mixed err = 0.0f;
const real scale = RECIP((real) 0xFFFFFFFF); const mixed scale = RECIP((mixed) 0xFFFFFFFF);
for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real3 f = make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
real invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
} }
error[threadIdx.x] = err; error[threadIdx.x] = err;
...@@ -74,9 +93,9 @@ extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol, ...@@ -74,9 +93,9 @@ extern "C" __global__ void selectVerletStepSize(real maxStepSize, real errorTol,
__syncthreads(); __syncthreads();
} }
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
real totalError = SQRT(error[0]/(NUM_ATOMS*3)); mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
real newStepSize = SQRT(errorTol/totalError); mixed newStepSize = sqrt(errorTol/totalError);
real oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase. newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize) if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
......
...@@ -1410,11 +1410,12 @@ void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& ...@@ -1410,11 +1410,12 @@ void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl&
} }
} }
template <class T, class T4> template <class T, class T4, class M4>
void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) { void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) {
// Compute the local coordinates relative to the center of mass. // Compute the local coordinates relative to the center of mass.
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
vector<T4> posq, velm; vector<T4> posq;
vector<M4> velm;
cu.getPosq().download(posq); cu.getPosq().download(posq);
cu.getVelm().download(velm); cu.getVelm().download(velm);
double totalMass = 0.0; double totalMass = 0.0;
...@@ -1524,9 +1525,11 @@ void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextIm ...@@ -1524,9 +1525,11 @@ void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextIm
void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) { void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) {
context.calcForcesAndEnergy(false, false, -1); context.calcForcesAndEnergy(false, false, -1);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
computeSystemMultipoleMoments<double, double4>(context, outputMultipoleMoments); computeSystemMultipoleMoments<double, double4, double4>(context, outputMultipoleMoments);
else if (cu.getUseMixedPrecision())
computeSystemMultipoleMoments<float, float4, double4>(context, outputMultipoleMoments);
else else
computeSystemMultipoleMoments<float, float4>(context, outputMultipoleMoments); computeSystemMultipoleMoments<float, float4, float4>(context, outputMultipoleMoments);
} }
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
......
...@@ -321,7 +321,7 @@ private: ...@@ -321,7 +321,7 @@ private:
const char* getSortKey() const {return "value.y";} const char* getSortKey() const {return "value.y";}
}; };
void initializeScaleFactors(); void initializeScaleFactors();
template <class T, class T4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments); template <class T, class T4, class M4> void computeSystemMultipoleMoments(ContextImpl& context, std::vector<double>& outputMultipoleMoments);
int numMultipoles, maxInducedIterations; int numMultipoles, maxInducedIterations;
double inducedEpsilon; double inducedEpsilon;
bool hasInitializedScaleFactors, hasInitializedFFT; bool hasInitializedScaleFactors, hasInitializedFFT;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment