Assorted cleanup and bug fixes

1de2e2a0 · Peter Eastman · 93497df6 · 1de2e2a0 · 1de2e2a0 · 1de2e2a0
Commit 1de2e2a0 authored Oct 18, 2012 by Peter Eastman
9 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -500,7 +500,7 @@ void CudaContext::clearBuffer(CudaArray& array) {
 void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    int words = size/4;
    void* args[] = {&memory, &words};
-    executeKernel(clearBufferKernel, args, size, 128);
+    executeKernel(clearBufferKernel, args, words, 128);
 }

 void CudaContext::addAutoclearBuffer(CudaArray& array) {
@@ -827,6 +827,8 @@ void CudaContext::validateMolecules() {
    else if (useMixedPrecision) {
        vector<float4> oldPosq(paddedNumAtoms);
        vector<float4> newPosq(paddedNumAtoms);
+        vector<float4> oldPosqCorrection(paddedNumAtoms);
+        vector<float4> newPosqCorrection(paddedNumAtoms);
        vector<double4> oldVelm(paddedNumAtoms);
        vector<double4> newVelm(paddedNumAtoms);
        posq->download(oldPosq);
@@ -834,10 +836,12 @@ void CudaContext::validateMolecules() {
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
+            newPosqCorrection[index] = oldPosqCorrection[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq->upload(newPosq);
+        posqCorrection->upload(newPosqCorrection);
        velm->upload(newVelm);
    }
    else {
@@ -978,9 +982,9 @@ void CudaContext::reorderAtomsImpl(bool enforcePeriodic) {
        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
        Real binWidth;
        if (useHilbert)
-            binWidth = (Real)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
+            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
        else
-            binWidth = (Real)(0.2*nonbonded->getCutoffDistance());
+            binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
        Real invBinWidth = (Real) (1.0/binWidth);
        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);

--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -109,12 +109,12 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
        posDelta->upload(deltas);
        stepSize = CudaArray::create<double2>(context, 1, "stepSize");
-        vector<double2> step(1, make_double2(0.0f, 0.0f));
+        vector<double2> step(1, make_double2(0.0, 0.0));
        stepSize->upload(step);
    }
    else {
        posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
-        vector<float4> deltas(posDelta->getSize(), make_float4(0.0, 0.0, 0.0, 0.0));
+        vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
        posDelta->upload(deltas);
        stepSize = CudaArray::create<float2>(context, 1, "stepSize");
        vector<float2> step(1, make_float2(0.0f, 0.0f));

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -200,7 +200,7 @@ void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<
            pos.z = (float) p[2];
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
-            posq[i] = make_float4(0.0, 0.0, 0.0, 0.0);
+            posq[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        cu.getPosq().upload(posq);
    }
    if (cu.getUseMixedPrecision()) {
@@ -214,7 +214,7 @@ void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<
            c.w = 0;
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
-            posCorrection[i] = make_float4(0.0, 0.0, 0.0, 0.0);
+            posCorrection[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        cu.getPosqCorrection().upload(posCorrection);
    }
    for (int i = 0; i < (int) cu.getPosCellOffsets().size(); i++)
@@ -275,7 +275,7 @@ void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector
            vel.z = p[2];
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
-            velm[i] = make_float4(0.0, 0.0, 0.0, 0.0);
+            velm[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        cu.getVelm().upload(velm);
    }
 }
@@ -310,6 +310,8 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
    cu.setAsCurrent();
    int version = 1;
    stream.write((char*) &version, sizeof(int));
+    int precision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
+    stream.write((char*) &precision, sizeof(int));
    double time = cu.getTime();
    stream.write((char*) &time, sizeof(double));
    int stepCount = cu.getStepCount();
@@ -319,6 +321,10 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
    char* buffer = (char*) cu.getPinnedBuffer();
    cu.getPosq().download(buffer);
    stream.write(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
+    if (cu.getUseMixedPrecision()) {
+        cu.getPosqCorrection().download(buffer);
+        stream.write(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
+    }
    cu.getVelm().download(buffer);
    stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
    stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
@@ -335,6 +341,11 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
    stream.read((char*) &version, sizeof(int));
    if (version != 1)
        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
+    int precision;
+    stream.read((char*) &precision, sizeof(int));
+    int expectedPrecision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
+    if (precision != expectedPrecision)
+        throw OpenMMException("Checkpoint was created with a different numeric precision");
    double time;
    stream.read((char*) &time, sizeof(double));
    int stepCount, computeForceCount;
@@ -349,6 +360,10 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
    char* buffer = (char*) cu.getPinnedBuffer();
    stream.read(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
    cu.getPosq().upload(buffer);
+    if (cu.getUseMixedPrecision()) {
+        stream.read(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
+        cu.getPosqCorrection().upload(buffer);
+    }
    stream.read(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
    cu.getVelm().upload(buffer);
    stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -157,8 +157,8 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
        mixed4 xpj1 = posDelta[atoms.y];
        mixed4 pos2 = make_mixed4(0);
        mixed4 xpj2 = make_mixed4(0);
-        real invMassCentral = params.x;
-        real avgMass = params.y;
+        float invMassCentral = params.x;
+        float avgMass = params.y;
        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
@@ -270,8 +270,8 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
        mixed4 xpj1 = posDelta[atoms.y];
        mixed4 pos2 = make_mixed4(0);
        mixed4 xpj2 = make_mixed4(0);
-        real invMassCentral = params.x;
-        real avgMass = params.y;
+        float invMassCentral = params.x;
+        float avgMass = params.y;
        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
@@ -361,7 +361,7 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
 /**
 * Enforce constraints on SETTLE clusters
 */
-extern "C" __global__ void applySettleToPositions(int numClusters, float tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
+extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    while (index < numClusters) {
        // Load the data for this cluster.
@@ -440,7 +440,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, float tol, co
        //                                        --- Step2  A2' ---

        float rc = 0.5f*params.y;
-        float rb = sqrt(params.x*params.x-rc*rc);
+        mixed rb = sqrt(params.x*params.x-rc*rc);
        mixed ra = rb*(m1+m2)*invTotalMass;
        rb -= ra;
        mixed sinphi = za1d/ra;
@@ -513,7 +513,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, float tol, co
 /**
 * Enforce velocity constraints on SETTLE clusters
 */
-extern "C" __global__ void applySettleToVelocities(int numClusters, float tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
+extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numClusters; index += blockDim.x*gridDim.x) {
        // Load the data for this cluster.

@@ -603,8 +603,8 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
    if (threadIdx.x == 0)
        groupConverged = 1;
    __syncthreads();
-    mixed lowerTol = 1.0f-2.0f*tol+tol*tol;
-    mixed upperTol = 1.0f+2.0f*tol+tol*tol;
+    mixed lowerTol = 1-2*tol+tol*tol;
+    mixed upperTol = 1+2*tol+tol*tol;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.

@@ -634,7 +634,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
 * Compute the force applied by each CCMA velocity constraint.
 */
 extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
-        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, float tol, int iteration) {
+        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, mixed tol, int iteration) {
    __shared__ int groupConverged;
    if (converged[1-iteration%2]) {
        if (blockIdx.x == 0 && threadIdx.x == 0)
@@ -644,8 +644,8 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
    if (threadIdx.x == 0)
        groupConverged = 1;
    __syncthreads();
-    mixed lowerTol = 1.0f-2.0f*tol+tol*tol;
-    mixed upperTol = 1.0f+2.0f*tol+tol*tol;
+    mixed lowerTol = 1-2*tol+tol*tol;
+    mixed upperTol = 1+2*tol+tol*tol;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
        // Compute the force due to this constraint.

@@ -654,7 +654,7 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
        mixed4 rp_ij = atomPositions[atoms.x]-atomPositions[atoms.y];
        mixed rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
        mixed d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
-        delta1[index] = -2.0f*reducedMass[index]*rrpr/d_ij2;
+        delta1[index] = -2*reducedMass[index]*rrpr/d_ij2;

        // See whether it has converged.


--- a/platforms/cuda/src/kernels/removeCM.cu
+++ b/platforms/cuda/src/kernels/removeCM.cu
@@ -9,9 +9,9 @@ extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const mixed4*
        mixed4 velocity = velm[index];
        if (velocity.w != 0) {
            mixed mass = RECIP(velocity.w);
-            cm.x += (float) velocity.x*mass;
-            cm.y += (float) velocity.y*mass;
-            cm.z += (float) velocity.z*mass;
+            cm.x += (float) (velocity.x*mass);
+            cm.y += (float) (velocity.y*mass);
+            cm.z += (float) (velocity.z*mass);
        }
    }


--- a/platforms/cuda/tests/TestCudaAndersenThermostat.cpp
+++ b/platforms/cuda/tests/TestCudaAndersenThermostat.cpp
@@ -84,7 +84,7 @@ void testTemperature() {
    }
    ke /= numSteps;
    double expected = 0.5*numParticles*3*BOLTZ*temp;
-    ASSERT_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
 }

 void testConstraints() {
@@ -137,7 +137,7 @@ void testConstraints() {
    }
    ke /= numSteps;
    double expected = 0.5*(numParticles*3-system.getNumConstraints())*BOLTZ*temp;
-    ASSERT_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
 }

 void testRandomSeed() {

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -896,6 +896,7 @@ void OpenCLContext::validateMolecules() {
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq->upload(newPosq);
+        posqCorrection->upload(newPosqCorrection);
        velm->upload(newVelm);
    }
    else {

--- a/platforms/opencl/tests/TestOpenCLAndersenThermostat.cpp
+++ b/platforms/opencl/tests/TestOpenCLAndersenThermostat.cpp
@@ -137,7 +137,7 @@ void testConstraints() {
    }
    ke /= numSteps;
    double expected = 0.5*(numParticles*3-system.getNumConstraints())*BOLTZ*temp;
-    ASSERT_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
 }

 void testRandomSeed() {

--- a/platforms/reference/tests/TestReferenceAndersenThermostat.cpp
+++ b/platforms/reference/tests/TestReferenceAndersenThermostat.cpp
@@ -84,7 +84,7 @@ void testTemperature() {
    }
    ke /= numSteps;
    double expected = 0.5*numParticles*3*BOLTZ*temp;
-    ASSERT_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
 }

 void testConstraints() {
@@ -137,7 +137,7 @@ void testConstraints() {
    }
    ke /= numSteps;
    double expected = 0.5*(numParticles*3-system.getNumConstraints())*BOLTZ*temp;
-    ASSERT_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
+    ASSERT_USUALLY_EQUAL_TOL(expected, ke, 6/std::sqrt((double) numSteps));
 }

 void testRandomSeed() {