Various cleanups to integration code

3749e84b · Peter Eastman · 1ef6e1d8 · 3749e84b · 3749e84b · 3749e84b
Commit 3749e84b authored Jun 24, 2009 by Peter Eastman
11 changed files
--- a/openmmapi/include/openmm/VerletIntegrator.h
+++ b/openmmapi/include/openmm/VerletIntegrator.h
@@ -39,7 +39,7 @@
 namespace OpenMM {

 /**
- * This is an Integrator which simulates a System using the velocity Verlet algorithm.
+ * This is an Integrator which simulates a System using the leap-frog Verlet algorithm.
 */

 class OPENMM_EXPORT VerletIntegrator : public Integrator {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -488,7 +488,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
        prevFriction = friction;
        prevStepSize = stepSize;
    }
-    kUpdatePart1(gpu);
+    kLangevinUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
    kApplyFirstCCMA(gpu);
@@ -497,7 +497,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
        if (step%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
    }
-    kUpdatePart2(gpu);
+    kLangevinUpdatePart2(gpu);
    kApplySecondShake(gpu);
    kApplySecondSettle(gpu);
    kApplySecondCCMA(gpu);

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -40,14 +40,14 @@ extern void kCalculateObcGbsaForces2(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
 extern void kCalculateAndersenThermostat(gpuContext gpu);
 extern void kReduceBornSumAndForces(gpuContext gpu);
-extern void kUpdatePart1(gpuContext gpu);
 extern void kApplyFirstShake(gpuContext gpu);
 extern void kApplyFirstCCMA(gpuContext gpu);
 extern void kApplyFirstSettle(gpuContext gpu);
-extern void kUpdatePart2(gpuContext gpu);
 extern void kApplySecondShake(gpuContext gpu);
 extern void kApplySecondCCMA(gpuContext gpu);
 extern void kApplySecondSettle(gpuContext gpu);
+extern void kLangevinUpdatePart1(gpuContext gpu);
+extern void kLangevinUpdatePart2(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
 extern void kVerletUpdatePart2(gpuContext gpu);
 extern void kBrownianUpdatePart1(gpuContext gpu);
@@ -72,8 +72,10 @@ extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void SetForcesSim(gpuContext gpu);
 extern void GetForcesSim(gpuContext gpu);
-extern void SetUpdateShakeHSim(gpuContext gpu);
-extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void SetShakeHSim(gpuContext gpu);
+extern void GetShakeHSim(gpuContext gpu);
+extern void SetLangevinUpdateSim(gpuContext gpu);
+extern void GetLangevinUpdateSim(gpuContext gpu);
 extern void SetSettleSim(gpuContext gpu);
 extern void GetSettleSim(gpuContext gpu);
 extern void SetCCMASim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1757,7 +1757,8 @@ int gpuSetConstants(gpuContext gpu)
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);
-    SetUpdateShakeHSim(gpu);
+    SetShakeHSim(gpu);
+    SetLangevinUpdateSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetSettleSim(gpu);

--- a/platforms/cuda/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda/src/kernels/kBrownianUpdate.cu
@@ -35,8 +35,6 @@ using namespace std;

 #include "gputypes.h"

-#define DeltaShake
-
 static __constant__ cudaGmxSimulation cSim;

 void SetBrownianUpdateSim(gpuContext gpu)
@@ -66,15 +64,9 @@ __global__ void kBrownianUpdatePart1_kernel()
        float4 force            = cSim.pForce4[pos];

        cSim.pOldPosq[pos]      = apos;
-#ifndef DeltaShake
-        apos.x                 += force.x*cSim.GDT + random4a.x;
-        apos.y                 += force.y*cSim.GDT + random4a.y;
-        apos.z                 += force.z*cSim.GDT + random4a.z;
-#else
        apos.x                  = force.x*cSim.GDT + random4a.x;
        apos.y                  = force.y*cSim.GDT + random4a.y;
        apos.z                  = force.z*cSim.GDT + random4a.z;
-#endif
        cSim.pPosqP[pos]        = apos;
        pos                    += blockDim.x * gridDim.x;
    }
@@ -99,11 +91,6 @@ __global__ void kBrownianUpdatePart2_kernel()
        float4 apos             = cSim.pPosq[pos];
        float4 xPrime           = cSim.pPosqP[pos];

-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
@@ -111,7 +98,7 @@ __global__ void kBrownianUpdatePart2_kernel()
        xPrime.x               += apos.x;
        xPrime.y               += apos.y;
        xPrime.z               += apos.z;
-#endif
+
        cSim.pPosq[pos]         = xPrime;
        cSim.pVelm4[pos]        = velocity;
         

--- a/platforms/cuda/src/kernels/kLangevinUpdate.cu
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+using namespace std;
+
+#include "gputypes.h"
+
+static __constant__ cudaGmxSimulation cSim;
+
+// Copies the host-side simulation description (gpu->sim) into cSim, this
+// translation unit's __constant__ copy read by the Langevin kernels.
+// Any failure of the copy is reported through RTERROR.
+void SetLangevinUpdateSim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Reads this translation unit's __constant__ cSim back into the host-side
+// gpu->sim structure. Any failure of the copy is reported through RTERROR.
+void GetLangevinUpdateSim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+// Include versions of the kernels with and without center of mass motion removal.
+
+#include "kLangevinUpdate.h"
+#define REMOVE_CM
+#include "kLangevinUpdate.h"
+
+// Launches the first Langevin update kernel on gpu->sim.blocks blocks of
+// gpu->sim.update_threads_per_block threads.
+// If gpu->bRemoveCM is set, the CM variant is launched instead, with one
+// float3 of dynamic shared memory per thread, and the flag is cleared.
+void kLangevinUpdatePart1(gpuContext gpu)
+{
+//    printf("kLangevinUpdatePart1\n");
+    if (!gpu->bRemoveCM)
+    {
+        // Plain variant: no shared memory needed.
+        kLangevinUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kLangevinUpdatePart1");
+        return;
+    }
+
+    kLangevinUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+    LAUNCHERROR("kLangevinUpdatePart1CM");
+    gpu->bRemoveCM = false;
+}
+
+extern void kGenerateRandoms(gpuContext gpu);
+// Launches the second Langevin update kernel, then advances the host-side
+// iteration counter.
+// If gpu->bCalculateCM is set, the CM variant is launched (one float3 of
+// dynamic shared memory per thread), bCalculateCM is cleared and bRemoveCM is
+// set. Once gpu->iterations reaches gpu->sim.randomIterations,
+// kGenerateRandoms is called and the counter restarts from zero.
+void kLangevinUpdatePart2(gpuContext gpu)
+{
+//    printf("kLangevinUpdatePart2\n");
+    if (!gpu->bCalculateCM)
+    {
+        kLangevinUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kLangevinUpdatePart2");
+    }
+    else
+    {
+        kLangevinUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kLangevinUpdatePart2CM");
+        gpu->bCalculateCM = false;
+        gpu->bRemoveCM = true;
+    }
+
+    // Refill the random number pool once it has been used randomIterations times.
+    if (++gpu->iterations == gpu->sim.randomIterations)
+    {
+        kGenerateRandoms(gpu);
+        gpu->iterations = 0;
+    }
+}
+
--- a/platforms/cuda/src/kernels/kLangevinUpdate.h
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+
+/**
+ * This file contains the kernels for Langevin integration.  It is included
+ * several times in kLangevinUpdate.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+/**
+ * First half of the Langevin update.  For each atom handled by this thread the
+ * kernel reads six random components (pRandom4a, pRandom2a), stores a new
+ * vVector, updates the velocity from the force and the stored xVector, saves
+ * the current position in pOldPosq and writes velocity * fix1 into pPosqP.
+ *
+ * With REMOVE_CM defined, every block first sums the gridDim.x entries of
+ * pLinearMomentum in dynamic shared memory (one float3 per thread is required)
+ * and subtracts that sum from each updated velocity.
+ */
+#ifdef REMOVE_CM
+__global__ void kLangevinUpdatePart1CM_kernel()
+#else
+__global__ void kLangevinUpdatePart1_kernel()
+#endif
+{
+    const unsigned int randomBase = cSim.pRandomPosition[blockIdx.x];
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+
+    // Each thread accumulates a strided subset of the per-block values
+    // written by the previous step.
+    float3 partial = { 0.0f, 0.0f, 0.0f };
+    for (unsigned int block = threadIdx.x; block < gridDim.x; block += blockDim.x)
+    {
+        const float4 entry = cSim.pLinearMomentum[block];
+        partial.x += entry.x;
+        partial.y += entry.y;
+        partial.z += entry.z;
+    }
+    sCM[threadIdx.x] = partial;
+    __syncthreads();
+
+    // Pairwise reduction in shared memory; the total ends up in sCM[0].
+    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        __syncthreads();
+    }
+    const float3 cmShift = sCM[0];
+#endif
+
+    const unsigned int stride = blockDim.x * gridDim.x;
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        float4 velocity         = cSim.pVelm4[atom];
+        const float4 xVector    = cSim.pxVector4[atom];
+        const float4 rand4      = cSim.pRandom4a[randomBase + atom];
+        const float2 rand2      = cSim.pRandom2a[randomBase + atom];
+        const float4 position   = cSim.pPosq[atom];
+        const float4 force      = cSim.pForce4[atom];
+
+        // velocity.w is used as the inverse mass (Part2 computes mass as 1.0f / velocity.w).
+        const float sqrtInvMass = sqrt(velocity.w);
+
+        float3 vmh;
+        vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * rand4.x;
+        vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * rand4.y;
+        vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * rand4.z;
+
+        float4 vVector;
+        vVector.x = sqrtInvMass * rand4.w;
+        vVector.y = sqrtInvMass * rand2.x;
+        vVector.z = sqrtInvMass * rand2.y;
+        vVector.w = 0.0f;
+        cSim.pvVector4[atom] = vVector;
+
+        velocity.x = velocity.x * cSim.EM +
+                     velocity.w * force.x * cSim.TauOneMinusEM +
+                     vVector.x -
+                     cSim.EM * vmh.x;
+        velocity.y = velocity.y * cSim.EM +
+                     velocity.w * force.y * cSim.TauOneMinusEM +
+                     vVector.y -
+                     cSim.EM * vmh.y;
+        velocity.z = velocity.z * cSim.EM +
+                     velocity.w * force.z * cSim.TauOneMinusEM +
+                     vVector.z -
+                     cSim.EM * vmh.z;
+#ifdef REMOVE_CM
+        velocity.x -= cmShift.x;
+        velocity.y -= cmShift.y;
+        velocity.z -= cmShift.z;
+#endif
+        cSim.pOldPosq[atom] = position;
+
+        // pPosqP receives a displacement (velocity * fix1), not an absolute
+        // position; the .w component of the position is carried over unchanged.
+        float4 delta = position;
+        delta.x = velocity.x * cSim.fix1;
+        delta.y = velocity.y * cSim.fix1;
+        delta.z = velocity.z * cSim.fix1;
+        cSim.pPosqP[atom] = delta;
+        cSim.pVelm4[atom] = velocity;
+    }
+}
+
+/**
+ * Second half of the Langevin update.  For each atom handled by this thread
+ * the kernel recomputes the velocity as pPosqP * oneOverFix1, reads six random
+ * components (pRandom4b, pRandom2b), builds a new xVector, adjusts the
+ * displacement read from pPosqP by (xVector - Xmh) and writes the result to
+ * pPosq, together with the new velocity (pVelm4) and xVector (pxVector4).
+ * Thread 0 of each block then advances pRandomPosition[blockIdx.x] by
+ * paddedNumberOfAtoms, wrapping by cSim.randoms.
+ *
+ * With REMOVE_CM defined, the kernel also accumulates mass * velocity over the
+ * block's atoms, scales it by inverseTotalMass, reduces it in dynamic shared
+ * memory (one float3 per thread is required) and stores the block total in
+ * pLinearMomentum[blockIdx.x].
+ */
+#ifdef REMOVE_CM
+__global__ void kLangevinUpdatePart2CM_kernel()
+#else
+__global__ void kLangevinUpdatePart2_kernel()
+#endif
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM                   = {0.0f, 0.0f, 0.0f};
+#endif
+
+    // Every thread of the block must have loaded rpos before thread 0
+    // overwrites pRandomPosition[blockIdx.x] further down.  This barrier was
+    // previously present only in the REMOVE_CM variant; it is needed by both.
+    __syncthreads();
+
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xPrime           = cSim.pPosqP[pos];
+        float4 vVector          = cSim.pvVector4[pos];
+        float4 xVector;
+        float4 random4b         = cSim.pRandom4b[rpos + pos];
+        float2 random2b         = cSim.pRandom2b[rpos + pos];
+        float3 Xmh;
+        float sqrtInvMass       = sqrt(velocity.w);
+        velocity.x              = xPrime.x * cSim.oneOverFix1;
+        velocity.y              = xPrime.y * cSim.oneOverFix1;
+        velocity.z              = xPrime.z * cSim.oneOverFix1;
+#ifdef REMOVE_CM
+        // velocity.w holds the inverse mass.
+        float mass              = 1.0f / velocity.w;
+        CM.x                   += mass * velocity.x;
+        CM.y                   += mass * velocity.y;
+        CM.z                   += mass * velocity.z;
+#endif
+
+        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.x;
+        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.y;
+        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.z;
+        xVector.x               = sqrtInvMass * random4b.w;
+        xVector.y               = sqrtInvMass * random2b.x;
+        xVector.z               = sqrtInvMass * random2b.y;
+        // Initialise the unused component so the float4 store below does not
+        // write an indeterminate value (Part1 does the same for vVector.w).
+        xVector.w               = 0.0f;
+        xPrime.x               += xVector.x - Xmh.x;
+        xPrime.y               += xVector.y - Xmh.y;
+        xPrime.z               += xVector.z - Xmh.z;
+
+
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+        cSim.pxVector4[pos]     = xVector;
+
+        pos                    += blockDim.x * gridDim.x;
+    }
+
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+
+#ifdef REMOVE_CM
+    // Scale CM
+    CM.x *= cSim.inverseTotalMass;
+    CM.y *= cSim.inverseTotalMass;
+    CM.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = CM;
+    __syncthreads();
+
+    // Reduce CM for CTA
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        // Distinct name: the per-thread accumulator above is also called CM.
+        float4 blockCM;
+        blockCM.x                           = sCM[0].x;
+        blockCM.y                           = sCM[0].y;
+        blockCM.z                           = sCM[0].z;
+        blockCM.w                           = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x]    = blockCM;
+    }
+#endif
+}
+
--- a/platforms/cuda/src/kernels/kSettle.cu
+++ b/platforms/cuda/src/kernels/kSettle.cu
@@ -33,8 +33,6 @@
 //#include <fstream>
 using namespace std;

-#define DeltaShake
-
 #include "gputypes.h"



--- a/platforms/cuda/src/kernels/kUpdateShakeH.cu
+++ b/platforms/cuda/src/kernels/kUpdateShakeH.cu
--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
@@ -35,8 +35,6 @@ using namespace std;

 #include "gputypes.h"

-#define DeltaShake
-
 static __constant__ cudaGmxSimulation cSim;

 void SetVerletUpdateSim(gpuContext gpu)
@@ -53,102 +51,11 @@ void GetVerletUpdateSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kVerletUpdatePart1_kernel()
-{
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
-
-        cSim.pOldPosq[pos]      = apos;        
-        velocity.x             += dtOverMass*force.x;
-        velocity.y             += dtOverMass*force.y;
-        velocity.z             += dtOverMass*force.z;
-
-#ifndef DeltaShake
-        apos.x                 += velocity.x*cSim.deltaT;
-        apos.y                 += velocity.y*cSim.deltaT;
-        apos.z                 += velocity.z*cSim.deltaT;
-#else
-        apos.x                  = velocity.x*cSim.deltaT;
-        apos.y                  = velocity.y*cSim.deltaT;
-        apos.z                  = velocity.z*cSim.deltaT;
-#endif
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-
-__global__ void kVerletUpdatePart1CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    
-    // Read CM outputs from previous step
-    unsigned int cpos = threadIdx.x;
-    while (cpos < gridDim.x)
-    {
-        CM1             = cSim.pLinearMomentum[cpos];
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
-        CM.z           += CM1.z;
-        cpos           += blockDim.x;
-    }
-    sCM[threadIdx.x].x  = CM.x;
-    sCM[threadIdx.x].y  = CM.y;
-    sCM[threadIdx.x].z  = CM.z;
-    __syncthreads();
-    
-    // Reduce CM
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }       
-    
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
+// Include versions of the kernels with and without center of mass motion removal.

-        cSim.pOldPosq[pos]      = apos;        
-        velocity.x             += dtOverMass*force.x-sCM[0].x;
-        velocity.y             += dtOverMass*force.y-sCM[0].y;
-        velocity.z             += dtOverMass*force.z-sCM[0].z;
-
-#ifndef DeltaShake
-        apos.x                 += velocity.x*cSim.deltaT;
-        apos.y                 += velocity.y*cSim.deltaT;
-        apos.z                 += velocity.z*cSim.deltaT;
-#else
-        apos.x                  = velocity.x*cSim.deltaT;
-        apos.y                  = velocity.y*cSim.deltaT;
-        apos.z                  = velocity.z*cSim.deltaT;
-#endif
-
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
+#include "kVerletUpdate.h"
+#define REMOVE_CM
+#include "kVerletUpdate.h"

 void kVerletUpdatePart1(gpuContext gpu)
 {
@@ -166,105 +73,6 @@ void kVerletUpdatePart1(gpuContext gpu)
    }
 }

-__global__ void kVerletUpdatePart2_kernel()
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#endif
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-         
-        pos                    += blockDim.x * gridDim.x;    
-    }
-}
-
-__global__ void kVerletUpdatePart2CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-        float mass              = 1.0f / velocity.w;
-
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#endif
-
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-         
-        pos                    += blockDim.x * gridDim.x;    
-    }
-    
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }  
-}
-
 void kVerletUpdatePart2(gpuContext gpu)
 {
 //    printf("kVerletUpdatePart2\n");

--- a/platforms/cuda/src/kernels/kVerletUpdate.h
+++ b/platforms/cuda/src/kernels/kVerletUpdate.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernels for Verlet integration.  It is included
+ * several times in kVerletUpdate.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+/**
+ * First half of the Verlet update.  For each atom handled by this thread the
+ * kernel saves the current position in pOldPosq, advances the velocity by
+ * deltaT * velocity.w * force and writes velocity * deltaT into pPosqP.
+ *
+ * With REMOVE_CM defined, every block first sums the gridDim.x entries of
+ * pLinearMomentum in dynamic shared memory (one float3 per thread is required)
+ * and subtracts that sum from each updated velocity.
+ */
+#ifdef REMOVE_CM
+__global__ void kVerletUpdatePart1CM_kernel()
+#else
+__global__ void kVerletUpdatePart1_kernel()
+#endif
+{
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+
+    // Each thread accumulates a strided subset of the per-block values
+    // written by the previous step.
+    float3 partial = { 0.0f, 0.0f, 0.0f };
+    for (unsigned int block = threadIdx.x; block < gridDim.x; block += blockDim.x)
+    {
+        const float4 entry = cSim.pLinearMomentum[block];
+        partial.x += entry.x;
+        partial.y += entry.y;
+        partial.z += entry.z;
+    }
+    sCM[threadIdx.x] = partial;
+    __syncthreads();
+
+    // Pairwise reduction in shared memory; the total ends up in sCM[0].
+    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        __syncthreads();
+    }
+    const float3 cmShift = sCM[0];
+#endif
+    const unsigned int stride = blockDim.x * gridDim.x;
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        const float4 position   = cSim.pPosq[atom];
+        float4 velocity         = cSim.pVelm4[atom];
+        const float4 force      = cSim.pForce4[atom];
+        // velocity.w is used as the inverse mass (Part2 computes mass as 1.0f / velocity.w).
+        const float dtOverMass  = cSim.deltaT*velocity.w;
+
+        cSim.pOldPosq[atom] = position;
+        velocity.x += dtOverMass*force.x;
+        velocity.y += dtOverMass*force.y;
+        velocity.z += dtOverMass*force.z;
+#ifdef REMOVE_CM
+        velocity.x -= cmShift.x;
+        velocity.y -= cmShift.y;
+        velocity.z -= cmShift.z;
+#endif
+
+        // pPosqP receives a displacement (velocity * deltaT), not an absolute
+        // position; the .w component of the position is carried over unchanged.
+        float4 delta = position;
+        delta.x = velocity.x*cSim.deltaT;
+        delta.y = velocity.y*cSim.deltaT;
+        delta.z = velocity.z*cSim.deltaT;
+
+        cSim.pPosqP[atom] = delta;
+        cSim.pVelm4[atom] = velocity;
+    }
+}
+
+/**
+ * Second half of the Verlet update.  For each atom handled by this thread the
+ * kernel recomputes the velocity as oneOverDeltaT * pPosqP, adds the
+ * displacement in pPosqP to the position in pPosq, and writes back the new
+ * position (pPosq) and velocity (pVelm4).
+ *
+ * With REMOVE_CM defined, the kernel also accumulates mass * velocity over the
+ * block's atoms, scales it by inverseTotalMass, reduces it in dynamic shared
+ * memory (one float3 per thread is required) and stores the block total in
+ * pLinearMomentum[blockIdx.x].
+ */
+#ifdef REMOVE_CM
+__global__ void kVerletUpdatePart2CM_kernel()
+#else
+__global__ void kVerletUpdatePart2_kernel()
+#endif
+{
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 momentum = {0.0f, 0.0f, 0.0f};
+#endif
+
+    const unsigned int stride = blockDim.x * gridDim.x;
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        float4 velocity         = cSim.pVelm4[atom];
+        const float4 position   = cSim.pPosq[atom];
+        float4 moved            = cSim.pPosqP[atom];
+
+        // pPosqP holds a displacement at this point, so the velocity is
+        // derived from it before the old position is added back.
+        velocity.x = cSim.oneOverDeltaT*(moved.x);
+        velocity.y = cSim.oneOverDeltaT*(moved.y);
+        velocity.z = cSim.oneOverDeltaT*(moved.z);
+
+        moved.x += position.x;
+        moved.y += position.y;
+        moved.z += position.z;
+
+#ifdef REMOVE_CM
+        // velocity.w holds the inverse mass.
+        const float mass = 1.0f / velocity.w;
+        momentum.x += mass * velocity.x;
+        momentum.y += mass * velocity.y;
+        momentum.z += mass * velocity.z;
+#endif
+        cSim.pPosq[atom]  = moved;
+        cSim.pVelm4[atom] = velocity;
+    }
+
+#ifdef REMOVE_CM
+    // Scale this thread's contribution before it is summed over the block.
+    momentum.x *= cSim.inverseTotalMass;
+    momentum.y *= cSim.inverseTotalMass;
+    momentum.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = momentum;
+    __syncthreads();
+
+    // Pairwise reduction in shared memory; the block total ends up in sCM[0].
+    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        float4 blockTotal;
+        blockTotal.x = sCM[0].x;
+        blockTotal.y = sCM[0].y;
+        blockTotal.z = sCM[0].z;
+        blockTotal.w = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x] = blockTotal;
+    }
+#endif
+}