Fixed lots of bugs found by mixed/double precision test cases

490cc912 · Peter Eastman · 3c8adf0c · 490cc912 · 490cc912 · 490cc912
Commit 490cc912 authored Nov 01, 2012 by Peter Eastman
7 changed files
--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -4446,7 +4446,7 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
    globalValues = new CudaArray(cu, max(1, numGlobalVariables), elementSize, "globalVariables");
    sumBuffer = new CudaArray(cu, 3*system.getNumParticles(), elementSize, "sumBuffer");
    potentialEnergy = new CudaArray(cu, 1, cu.getEnergyBuffer().getElementSize(), "potentialEnergy");
-    kineticEnergy = new CudaArray(cu, 1, cu.getEnergyBuffer().getElementSize(), "kineticEnergy");
+    kineticEnergy = new CudaArray(cu, 1, elementSize, "kineticEnergy");
    perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
    cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
    prevStepSize = -1.0;
@@ -4840,6 +4840,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
            args << ", " << buffer.getType() << "* __restrict__ " << valueName;
        }
        replacements["PARAMETER_ARGUMENTS"] = args.str();
+        defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
        if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
            defines.erase("LOAD_POS_AS_DELTA");
        module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -942,7 +942,7 @@ void OpenCLIntegrationUtilities::loadCheckpoint(istream& stream) {
 double OpenCLIntegrationUtilities::computeKineticEnergy(double timeShift) {
    int numParticles = context.getNumAtoms();
    double energy = 0.0;
-    if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
+    if (context.getUseDoublePrecision()) {
        mm_double4* force = (mm_double4*) context.getPinnedBuffer();
        context.getForce().download(force);
        vector<mm_double4> velm;
@@ -958,6 +958,22 @@ double OpenCLIntegrationUtilities::computeKineticEnergy(double timeShift) {
            }
        }
    }
+    else if (context.getUseMixedPrecision()) {
+        mm_float4* force = (mm_float4*) context.getPinnedBuffer();
+        context.getForce().download(force);
+        vector<mm_double4> velm;
+        context.getVelm().download(velm);
+        for (int i = 0; i < numParticles; i++) {
+            mm_double4 v = velm[i];
+            if (v.w != 0) {
+                double scale = timeShift*v.w;
+                v.x += scale*force[i].x;
+                v.y += scale*force[i].y;
+                v.z += scale*force[i].z;
+                energy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
+            }
+        }
+    }
    else {
        mm_float4* force = (mm_float4*) context.getPinnedBuffer();
        context.getForce().download(force);

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -4706,7 +4706,7 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
    int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
    globalValues = new OpenCLArray(cl, max(1, numGlobalVariables), elementSize, "globalVariables");
    sumBuffer = new OpenCLArray(cl, 3*system.getNumParticles(), elementSize, "sumBuffer");
-    potentialEnergy = new OpenCLArray(cl, 1, elementSize, "potentialEnergy");
+    potentialEnergy = new OpenCLArray(cl, 1, cl.getEnergyBuffer().getElementSize(), "potentialEnergy");
    kineticEnergy = new OpenCLArray(cl, 1, elementSize, "kineticEnergy");
    perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
    cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));

--- a/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
@@ -473,30 +473,46 @@ void testBlockInteractions(bool periodic) {
    // Verify that the bounds of each block were calculated correctly.
-    vector<mm_float4> posq;
+    vector<mm_double4> posq(clcontext.getPosq().getSize());
-    clcontext.getPosq().download(posq);
+    vector<mm_double4> blockCenters(numBlocks);
-    vector<mm_float4> blockCenters(numBlocks);
+    vector<mm_double4> blockBoundingBoxes(numBlocks);
-    vector<mm_float4> blockBoundingBoxes(numBlocks);
+    if (clcontext.getUseDoublePrecision()) {
-    nb.getBlockCenters().download(blockCenters);
+        clcontext.getPosq().download(posq);
-    nb.getBlockBoundingBoxes().download(blockBoundingBoxes);
+        nb.getBlockCenters().download(blockCenters);
+        nb.getBlockBoundingBoxes().download(blockBoundingBoxes);
+    }
+    else {
+        vector<mm_float4> posqf(clcontext.getPosq().getSize());
+        vector<mm_float4> blockCentersf(numBlocks);
+        vector<mm_float4> blockBoundingBoxesf(numBlocks);
+        clcontext.getPosq().download(posqf);
+        nb.getBlockCenters().download(blockCentersf);
+        nb.getBlockBoundingBoxes().download(blockBoundingBoxesf);
+        for (int i = 0; i < numParticles; i++)
+            posq[i] = mm_double4(posqf[i].x, posqf[i].y, posqf[i].z, posqf[i].w);
+        for (int i = 0; i < numBlocks; i++) {
+            blockCenters[i] = mm_double4(blockCentersf[i].x, blockCentersf[i].y, blockCentersf[i].z, blockCentersf[i].w);
+            blockBoundingBoxes[i] = mm_double4(blockBoundingBoxesf[i].x, blockBoundingBoxesf[i].y, blockBoundingBoxesf[i].z, blockBoundingBoxesf[i].w);
+        }
+    }
    for (int i = 0; i < numBlocks; i++) {
-        mm_float4 gridSize = blockBoundingBoxes[i];
+        mm_double4 gridSize = blockBoundingBoxes[i];
-        mm_float4 center = blockCenters[i];
+        mm_double4 center = blockCenters[i];
        if (periodic) {
            ASSERT(gridSize.x < 0.5*boxSize);
            ASSERT(gridSize.y < 0.5*boxSize);
            ASSERT(gridSize.z < 0.5*boxSize);
        }
-        float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
+        double minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
        for (int j = 0; j < blockSize; j++) {
-            mm_float4 pos = posq[i*blockSize+j];
+            mm_double4 pos = posq[i*blockSize+j];
-            float dx = pos.x-center.x;
+            double dx = pos.x-center.x;
-            float dy = pos.y-center.y;
+            double dy = pos.y-center.y;
-            float dz = pos.z-center.z;
+            double dz = pos.z-center.z;
            if (periodic) {
-                dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+                dx -= floor(0.5+dx/boxSize)*boxSize;
-                dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+                dy -= floor(0.5+dy/boxSize)*boxSize;
-                dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+                dz -= floor(0.5+dz/boxSize)*boxSize;
            }
            ASSERT(abs(dx) < gridSize.x+TOL);
            ASSERT(abs(dy) < gridSize.y+TOL);
@@ -538,21 +554,21 @@ void testBlockInteractions(bool periodic) {
        // Make sure this tile really should have been flagged based on bounding volumes.
-        mm_float4 gridSize1 = blockBoundingBoxes[x];
+        mm_double4 gridSize1 = blockBoundingBoxes[x];
-        mm_float4 gridSize2 = blockBoundingBoxes[y];
+        mm_double4 gridSize2 = blockBoundingBoxes[y];
-        mm_float4 center1 = blockCenters[x];
+        mm_double4 center1 = blockCenters[x];
-        mm_float4 center2 = blockCenters[y];
+        mm_double4 center2 = blockCenters[y];
-        float dx = center1.x-center2.x;
+        double dx = center1.x-center2.x;
-        float dy = center1.y-center2.y;
+        double dy = center1.y-center2.y;
-        float dz = center1.z-center2.z;
+        double dz = center1.z-center2.z;
        if (periodic) {
-            dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+            dx -= floor(0.5+dx/boxSize)*boxSize;
-            dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+            dy -= floor(0.5+dy/boxSize)*boxSize;
-            dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+            dz -= floor(0.5+dz/boxSize)*boxSize;
        }
-        dx = max(0.0f, abs(dx)-gridSize1.x-gridSize2.x);
+        dx = max(0.0, abs(dx)-gridSize1.x-gridSize2.x);
-        dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
+        dy = max(0.0, abs(dy)-gridSize1.y-gridSize2.y);
-        dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
+        dz = max(0.0, abs(dz)-gridSize1.z-gridSize2.z);
        ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
        // Check the interaction flags.
@@ -561,16 +577,16 @@ void testBlockInteractions(bool periodic) {
            unsigned int flags = interactionFlags[i];
            for (int atom2 = 0; atom2 < 32; atom2++) {
                if ((flags & 1) == 0) {
-                    mm_float4 pos2 = posq[y*blockSize+atom2];
+                    mm_double4 pos2 = posq[y*blockSize+atom2];
                    for (int atom1 = 0; atom1 < blockSize; ++atom1) {
-                        mm_float4 pos1 = posq[x*blockSize+atom1];
+                        mm_double4 pos1 = posq[x*blockSize+atom1];
-                        float dx = pos2.x-pos1.x;
+                        double dx = pos2.x-pos1.x;
-                        float dy = pos2.y-pos1.y;
+                        double dy = pos2.y-pos1.y;
-                        float dz = pos2.z-pos1.z;
+                        double dz = pos2.z-pos1.z;
                        if (periodic) {
-                            dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+                            dx -= floor(0.5+dx/boxSize)*boxSize;
-                            dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+                            dy -= floor(0.5+dy/boxSize)*boxSize;
-                            dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+                            dz -= floor(0.5+dz/boxSize)*boxSize;
                        }
                        ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
                    }
@@ -587,16 +603,16 @@ void testBlockInteractions(bool periodic) {
            unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
            unsigned int x = (i-y*numBlocks+y*(y+1)/2);
            for (int atom1 = 0; atom1 < blockSize; ++atom1) {
-                mm_float4 pos1 = posq[x*blockSize+atom1];
+                mm_double4 pos1 = posq[x*blockSize+atom1];
                for (int atom2 = 0; atom2 < blockSize; ++atom2) {
-                    mm_float4 pos2 = posq[y*blockSize+atom2];
+                    mm_double4 pos2 = posq[y*blockSize+atom2];
-                    float dx = pos1.x-pos2.x;
+                    double dx = pos1.x-pos2.x;
-                    float dy = pos1.y-pos2.y;
+                    double dy = pos1.y-pos2.y;
-                    float dz = pos1.z-pos2.z;
+                    double dz = pos1.z-pos2.z;
                    if (periodic) {
-                        dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
+                        dx -= (floor(0.5+dx/boxSize)*boxSize);
-                        dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
+                        dy -= (floor(0.5+dy/boxSize)*boxSize);
-                        dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
+                        dz -= (floor(0.5+dz/boxSize)*boxSize);
                    }
                    ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
                }

--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
@@ -187,7 +187,7 @@ __device__ void computeOneInteractionB1B2(AtomData2& atom1, volatile AtomData2&
 inline __device__ void loadAtomData2(AtomData2& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
        const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const real* __restrict__ bornRadius) {
-    float4 atomPosq = posq[atom];
+    real4 atomPosq = posq[atom];
    data.pos = trimTo3(atomPosq);
    data.q = atomPosq.w;
    data.dipole.x = labFrameDipole[atom*3];
@@ -412,7 +412,7 @@ inline __device__ void loadAtomData3(AtomData3& data, int atom, const real4* __r
 __device__ void computeBornChainRuleInteraction(AtomData3& atom1, AtomData3& atom2, real3& force) {
    real third = 1/(real) 3;
    real pi43 = 4*third*M_PI;
-    real factor = -POW(M_PI, third)*POW(6, 2/(real) 3)/9;
+    real factor = -POW(M_PI, third)*POW((real) 6, 2/(real) 3)/9;
    real term = pi43/(atom1.bornRadius*atom1.bornRadius*atom1.bornRadius);
    term = factor/POW(term, 4/(real) 3);

--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaStretchBendForce.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaStretchBendForce.cpp
@@ -45,7 +45,7 @@ using namespace OpenMM;
 extern "C" void registerAmoebaCudaKernelFactories();
-const double TOL = 1e-5;
+const double TOL = 1e-4;
 #define PI_M               3.141592653589
 #define RADIAN            57.29577951308
 const double DegreesToRadians = PI_M/180.0;