Improved integration accuracy on devices that don't support double precision

6d7f0273 · peastman · 09970632 · 6d7f0273 · 6d7f0273 · 6d7f0273
Commit 6d7f0273 authored Jul 16, 2013 by peastman
4 changed files
--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -32,7 +32,12 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
 */
 extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
+#if __CUDA_ARCH__ >= 130
    double invStepSize = 1.0/dt[0].y;
+#else
+    float invStepSize = 1.0f/dt[0].y;
+    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
+#endif
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    while (index < NUM_ATOMS) {
        mixed4 vel = velm[index];
@@ -48,9 +53,15 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
+#if __CUDA_ARCH__ >= 130
            vel.x = (mixed) (invStepSize*delta.x);
            vel.y = (mixed) (invStepSize*delta.y);
            vel.z = (mixed) (invStepSize*delta.z);
+#else  // float-only path: add the residual of the rounded reciprocal, per component
+            vel.x = invStepSize*delta.x + correction*delta.x;
+            vel.y = invStepSize*delta.y + correction*delta.y;
+            vel.z = invStepSize*delta.z + correction*delta.z;
+#endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -37,7 +37,12 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
 extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
    mixed2 stepSize = dt[0];
+#if __CUDA_ARCH__ >= 130
    double oneOverDt = 1.0/stepSize.y;
+#else
+    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
+#endif
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index == 0)
        dt[0].x = stepSize.y;
@@ -55,7 +60,11 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
+#if __CUDA_ARCH__ >= 130
            velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
+#else
+            velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
+#endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
@@ -36,6 +36,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
    double invStepSize = 1.0/dt[0].y;
 #else
    float invStepSize = 1.0f/dt[0].y;
+    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
 #endif
    int index = get_global_id(0);
    while (index < NUM_ATOMS) {
@@ -53,7 +54,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
 #ifdef SUPPORTS_DOUBLE_PRECISION
            vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
 #else
-            vel.xyz = invStepSize*delta.xyz;
+            vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
 #endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = convert_real4(pos);

--- a/platforms/opencl/src/kernels/verlet.cl
+++ b/platforms/opencl/src/kernels/verlet.cl
@@ -38,6 +38,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
    double oneOverDt = 1.0/stepSize.y;
 #else
    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
 #endif
    if (get_global_id(0) == 0)
        dt[0].x = stepSize.y;
@@ -58,7 +59,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
 #ifdef SUPPORTS_DOUBLE_PRECISION
            velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
 #else
-            velocity.xyz = delta.xyz*oneOverDt;
+            velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
 #endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = convert_real4(pos);