Various cleanups to integration code

3749e84b · Peter Eastman · 1ef6e1d8 · 3749e84b · 3749e84b · 3749e84b
Commit 3749e84b authored Jun 24, 2009 by Peter Eastman
11 changed files
--- a/openmmapi/include/openmm/VerletIntegrator.h
+++ b/openmmapi/include/openmm/VerletIntegrator.h
@@ -39,7 +39,7 @@
 namespace OpenMM {

 /**
- * This is an Integrator which simulates a System using the velocity Verlet algorithm.
+ * This is an Integrator which simulates a System using the leap-frog Verlet algorithm.
 */

 class OPENMM_EXPORT VerletIntegrator : public Integrator {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -488,7 +488,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
        prevFriction = friction;
        prevStepSize = stepSize;
    }
-    kUpdatePart1(gpu);
+    kLangevinUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
    kApplyFirstCCMA(gpu);
@@ -497,7 +497,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
        if (step%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
    }
-    kUpdatePart2(gpu);
+    kLangevinUpdatePart2(gpu);
    kApplySecondShake(gpu);
    kApplySecondSettle(gpu);
    kApplySecondCCMA(gpu);

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -40,14 +40,14 @@ extern void kCalculateObcGbsaForces2(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
 extern void kCalculateAndersenThermostat(gpuContext gpu);
 extern void kReduceBornSumAndForces(gpuContext gpu);
-extern void kUpdatePart1(gpuContext gpu);
 extern void kApplyFirstShake(gpuContext gpu);
 extern void kApplyFirstCCMA(gpuContext gpu);
 extern void kApplyFirstSettle(gpuContext gpu);
-extern void kUpdatePart2(gpuContext gpu);
 extern void kApplySecondShake(gpuContext gpu);
 extern void kApplySecondCCMA(gpuContext gpu);
 extern void kApplySecondSettle(gpuContext gpu);
+extern void kLangevinUpdatePart1(gpuContext gpu);
+extern void kLangevinUpdatePart2(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
 extern void kVerletUpdatePart2(gpuContext gpu);
 extern void kBrownianUpdatePart1(gpuContext gpu);
@@ -72,8 +72,10 @@ extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void SetForcesSim(gpuContext gpu);
 extern void GetForcesSim(gpuContext gpu);
-extern void SetUpdateShakeHSim(gpuContext gpu);
-extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void SetShakeHSim(gpuContext gpu);
+extern void GetShakeHSim(gpuContext gpu);
+extern void SetLangevinUpdateSim(gpuContext gpu);
+extern void GetLangevinUpdateSim(gpuContext gpu);
 extern void SetSettleSim(gpuContext gpu);
 extern void GetSettleSim(gpuContext gpu);
 extern void SetCCMASim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1757,7 +1757,8 @@ int gpuSetConstants(gpuContext gpu)
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);
-    SetUpdateShakeHSim(gpu);
+    SetShakeHSim(gpu);
+    SetLangevinUpdateSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetSettleSim(gpu);

--- a/platforms/cuda/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda/src/kernels/kBrownianUpdate.cu
@@ -35,8 +35,6 @@ using namespace std;

 #include "gputypes.h"

-#define DeltaShake
-
 static __constant__ cudaGmxSimulation cSim;

 void SetBrownianUpdateSim(gpuContext gpu)
@@ -66,15 +64,9 @@ __global__ void kBrownianUpdatePart1_kernel()
        float4 force            = cSim.pForce4[pos];

        cSim.pOldPosq[pos]      = apos;
-#ifndef DeltaShake
-        apos.x                 += force.x*cSim.GDT + random4a.x;
-        apos.y                 += force.y*cSim.GDT + random4a.y;
-        apos.z                 += force.z*cSim.GDT + random4a.z;
-#else
        apos.x                  = force.x*cSim.GDT + random4a.x;
        apos.y                  = force.y*cSim.GDT + random4a.y;
        apos.z                  = force.z*cSim.GDT + random4a.z;
-#endif
        cSim.pPosqP[pos]        = apos;
        pos                    += blockDim.x * gridDim.x;
    }
@@ -99,11 +91,6 @@ __global__ void kBrownianUpdatePart2_kernel()
        float4 apos             = cSim.pPosq[pos];
        float4 xPrime           = cSim.pPosqP[pos];

-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
@@ -111,7 +98,7 @@ __global__ void kBrownianUpdatePart2_kernel()
        xPrime.x               += apos.x;
        xPrime.y               += apos.y;
        xPrime.z               += apos.z;
-#endif
+
        cSim.pPosq[pos]         = xPrime;
        cSim.pVelm4[pos]        = velocity;
         

--- a/platforms/cuda/src/kernels/kLangevinUpdate.cu
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+using namespace std;
+
+#include "gputypes.h"
+
+static __constant__ cudaGmxSimulation cSim;
+
+void SetLangevinUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+void GetLangevinUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+// Include versions of the kernels with and with center of mass motion removal.
+
+#include "kLangevinUpdate.h"
+#define REMOVE_CM
+#include "kLangevinUpdate.h"
+
+void kLangevinUpdatePart1(gpuContext gpu)
+{
+//    printf("kLangevinUpdatePart1\n");
+    if (gpu->bRemoveCM)
+    {
+        kLangevinUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kLangevinUpdatePart1CM");
+        gpu->bRemoveCM = false;
+    }
+    else
+    {    
+        kLangevinUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kLangevinUpdatePart1");
+    }
+}
+
+extern void kGenerateRandoms(gpuContext gpu);
+void kLangevinUpdatePart2(gpuContext gpu)
+{
+//    printf("kLangevinUpdatePart2\n");
+    if (gpu->bCalculateCM)
+    {
+        kLangevinUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kLangevinUpdatePart2CM");
+        gpu->bCalculateCM = false;
+        gpu->bRemoveCM = true;
+    }
+    else
+    {
+        kLangevinUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kLangevinUpdatePart2");
+    }
+    
+    // Update randoms if necessary
+    gpu->iterations++;
+    if (gpu->iterations == gpu->sim.randomIterations)
+    {
+        kGenerateRandoms(gpu);
+        gpu->iterations = 0;
+    }
+}
+
--- a/platforms/cuda/src/kernels/kLangevinUpdate.h
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+
+/**
+ * This file contains the kernels for Langevin integration.  It is included
+ * several times in kLangevinUpdate.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+#ifdef REMOVE_CM
+__global__ void kLangevinUpdatePart1CM_kernel()
+#else
+__global__ void kLangevinUpdatePart1_kernel()
+#endif
+{
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM           = { 0.0f, 0.0f, 0.0f};
+    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    // Read CM outputs from previous step
+    unsigned int cpos = threadIdx.x;
+    while (cpos < gridDim.x)
+    {
+        CM1             = cSim.pLinearMomentum[cpos];
+        CM.x           += CM1.x;
+        CM.y           += CM1.y;
+        CM.z           += CM1.z;
+        cpos           += blockDim.x;
+    }
+    sCM[threadIdx.x].x  = CM.x;
+    sCM[threadIdx.x].y  = CM.y;
+    sCM[threadIdx.x].z  = CM.z;
+    __syncthreads();
+
+    // Reduce CM
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+#endif
+
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xVector          = cSim.pxVector4[pos];
+        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float2 random2a         = cSim.pRandom2a[rpos + pos];
+        float4 apos             = cSim.pPosq[pos];
+        float4 force            = cSim.pForce4[pos];
+
+        float3 Vmh;
+        float sqrtInvMass       = sqrt(velocity.w);
+        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
+        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
+        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
+        float4 vVector;
+        vVector.x               = sqrtInvMass * random4a.w;
+        vVector.y               = sqrtInvMass * random2a.x;
+        vVector.z               = sqrtInvMass * random2a.y;
+        vVector.w               = 0.0f;
+        cSim.pvVector4[pos]     = vVector;
+        velocity.x              = velocity.x * cSim.EM +
+                                  velocity.w * force.x * cSim.TauOneMinusEM +
+                                  vVector.x -
+                                  cSim.EM * Vmh.x;
+        velocity.y              = velocity.y * cSim.EM +
+                                  velocity.w * force.y * cSim.TauOneMinusEM +
+                                  vVector.y -
+                                  cSim.EM * Vmh.y;
+        velocity.z              = velocity.z * cSim.EM +
+                                  velocity.w * force.z * cSim.TauOneMinusEM +
+                                  vVector.z -
+                                  cSim.EM * Vmh.z;
+#ifdef REMOVE_CM
+        velocity.x             -= sCM[0].x;
+        velocity.y             -= sCM[0].y;
+        velocity.z             -= sCM[0].z;
+#endif
+        cSim.pOldPosq[pos]      = apos;
+        apos.x                  = velocity.x * cSim.fix1;
+        apos.y                  = velocity.y * cSim.fix1;
+        apos.z                  = velocity.z * cSim.fix1;
+        cSim.pPosqP[pos]        = apos;
+        cSim.pVelm4[pos]        = velocity;
+        pos                    += blockDim.x * gridDim.x;
+    }
+}
+
+#ifdef REMOVE_CM
+__global__ void kLangevinUpdatePart2CM_kernel()
+#else
+__global__ void kLangevinUpdatePart2_kernel()
+#endif
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM                   = {0.0f, 0.0f, 0.0f};
+    __syncthreads();
+#endif
+
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xPrime           = cSim.pPosqP[pos];
+        float4 vVector          = cSim.pvVector4[pos];
+        float4 xVector;
+        float4 random4b         = cSim.pRandom4b[rpos + pos];
+        float2 random2b         = cSim.pRandom2b[rpos + pos];
+        float3 Xmh;
+        float sqrtInvMass       = sqrt(velocity.w);
+        velocity.x              = xPrime.x * cSim.oneOverFix1;
+        velocity.y              = xPrime.y * cSim.oneOverFix1;
+        velocity.z              = xPrime.z * cSim.oneOverFix1;
+#ifdef REMOVE_CM
+        float mass              = 1.0f / velocity.w;
+        CM.x                   += mass * velocity.x;
+        CM.y                   += mass * velocity.y;
+        CM.z                   += mass * velocity.z;
+#endif;
+
+        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.x;
+        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.y;
+        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.z;
+        xVector.x               = sqrtInvMass * random4b.w;
+        xVector.y               = sqrtInvMass * random2b.x;
+        xVector.z               = sqrtInvMass * random2b.y;
+        xPrime.x               += xVector.x - Xmh.x;
+        xPrime.y               += xVector.y - Xmh.y;
+        xPrime.z               += xVector.z - Xmh.z;
+
+
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+        cSim.pxVector4[pos]     = xVector;
+
+        pos                    += blockDim.x * gridDim.x;
+    }
+
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+
+#ifdef REMOVE_CM
+    // Scale CM
+    CM.x *= cSim.inverseTotalMass;
+    CM.y *= cSim.inverseTotalMass;
+    CM.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = CM;
+    __syncthreads();
+
+    // Reduce CM for CTA
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        float4 CM;
+        CM.x                                = sCM[0].x;
+        CM.y                                = sCM[0].y;
+        CM.z                                = sCM[0].z;
+        CM.w                                = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x]    = CM;
+    }
+#endif
+}
+
--- a/platforms/cuda/src/kernels/kSettle.cu
+++ b/platforms/cuda/src/kernels/kSettle.cu
@@ -33,8 +33,6 @@
 //#include <fstream>
 using namespace std;

-#define DeltaShake
-
 #include "gputypes.h"



--- a/platforms/cuda/src/kernels/kUpdateShakeH.cu
+++ b/platforms/cuda/src/kernels/kUpdateShakeH.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-//#include <fstream>
-using namespace std;
-
-#define DeltaShake
-
-#include "gputypes.h"
-
-struct Atom 
-{
-    float3 rij1;
-    float3 rij2;
-    float3 rij3;
-    float  M;
-    float  d2;
-    float  InvMassI;
-    float  rij1sq;
-    float  rij2sq;
-    float  rij3sq;
-};
-
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetUpdateShakeHSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetUpdateShakeHSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-__global__ void kUpdatePart1_kernel()
-{
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
-    __syncthreads();
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 xVector          = cSim.pxVector4[pos];
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
-        float2 random2a         = cSim.pRandom2a[rpos + pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 force            = cSim.pForce4[pos];
-        
-        float3 Vmh;
-        float sqrtInvMass       = sqrt(velocity.w);
-        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
-        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
-        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
-        float4 vVector;
-        vVector.x               = sqrtInvMass * random4a.w;
-        vVector.y               = sqrtInvMass * random2a.x;
-        vVector.z               = sqrtInvMass * random2a.y;
-        vVector.w               = 0.0f;
-        cSim.pvVector4[pos]     = vVector;
-        velocity.x              = velocity.x * cSim.EM + 
-                                  velocity.w * force.x * cSim.TauOneMinusEM +
-                                  vVector.x -
-                                  cSim.EM * Vmh.x;
-        velocity.y              = velocity.y * cSim.EM + 
-                                  velocity.w * force.y * cSim.TauOneMinusEM +
-                                  vVector.y -
-                                  cSim.EM * Vmh.y;
-        velocity.z              = velocity.z * cSim.EM + 
-                                  velocity.w * force.z * cSim.TauOneMinusEM +
-                                  vVector.z -
-                                  cSim.EM * Vmh.z;
-        cSim.pOldPosq[pos]      = apos;
-#ifndef DeltaShake
-        apos.x                 += velocity.x * cSim.fix1;
-        apos.y                 += velocity.y * cSim.fix1;
-        apos.z                 += velocity.z * cSim.fix1;
-#else
-        apos.x                  = velocity.x * cSim.fix1;
-        apos.y                  = velocity.y * cSim.fix1;
-        apos.z                  = velocity.z * cSim.fix1;
-#endif
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-
-__global__ void kUpdatePart1CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    
-    // Read CM outputs from previous step
-    unsigned int cpos = threadIdx.x;
-#if 0
-    float4 CM2          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    float4 CM3          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    float4 CM4          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    if (cpos < gridDim.x)
-        CM1             = cSim.pLinearMomentum[cpos];
-    cpos               += gridDim.x;
-    if (cpos < gridDim.x)
-        CM2             = cSim.pLinearMomentum[cpos];
-    cpos               += gridDim.x;
-    if (cpos < gridDim.x)
-        CM3             = cSim.pLinearMomentum[cpos];
-    cpos               += gridDim.x;
-    if (cpos < gridDim.x)
-        CM4             = cSim.pLinearMomentum[cpos];
-    sCM[threadIdx.x].x  = CM1.x + CM2.x + CM3.x + CM4.x;
-    sCM[threadIdx.x].y  = CM1.y + CM2.y + CM3.y + CM4.y;
-    sCM[threadIdx.x].z  = CM1.z + CM2.z + CM3.z + CM4.z;
-#else
-    while (cpos < gridDim.x)
-    {
-        CM1             = cSim.pLinearMomentum[cpos];
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
-        CM.z           += CM1.z;
-        cpos           += blockDim.x;
-    }
-    sCM[threadIdx.x].x  = CM.x;
-    sCM[threadIdx.x].y  = CM.y;
-    sCM[threadIdx.x].z  = CM.z;
-#endif
-    __syncthreads();
-    
-    // Reduce CM
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }       
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 xVector          = cSim.pxVector4[pos];
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
-        float2 random2a         = cSim.pRandom2a[rpos + pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 force            = cSim.pForce4[pos];
-        
-        float3 Vmh;
-        float sqrtInvMass       = sqrt(velocity.w);
-        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
-        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
-        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
-        float4 vVector;
-        vVector.x               = sqrtInvMass * random4a.w;
-        vVector.y               = sqrtInvMass * random2a.x;
-        vVector.z               = sqrtInvMass * random2a.y;
-        vVector.w               = 0.0f;
-        cSim.pvVector4[pos]     = vVector;
-        velocity.x              = velocity.x * cSim.EM + 
-                                  velocity.w * force.x * cSim.TauOneMinusEM +
-                                  vVector.x -
-                                  cSim.EM * Vmh.x -
-                                  sCM[0].x;
-        velocity.y              = velocity.y * cSim.EM + 
-                                  velocity.w * force.y * cSim.TauOneMinusEM +
-                                  vVector.y -
-                                  cSim.EM * Vmh.y -
-                                  sCM[0].y;
-        velocity.z              = velocity.z * cSim.EM + 
-                                  velocity.w * force.z * cSim.TauOneMinusEM +
-                                  vVector.z -
-                                  cSim.EM * Vmh.z -
-                                  sCM[0].z;
-        cSim.pOldPosq[pos]      = apos;
-#ifndef DeltaShake
-        apos.x                 += velocity.x * cSim.fix1;
-        apos.y                 += velocity.y * cSim.fix1;
-        apos.z                 += velocity.z * cSim.fix1;
-#else
-        apos.x                  = velocity.x * cSim.fix1;
-        apos.y                  = velocity.y * cSim.fix1;
-        apos.z                  = velocity.z * cSim.fix1;
-#endif
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-
-
-
-void kUpdatePart1(gpuContext gpu)
-{
-//    printf("kUpdatePart1\n");
-#if 0
-    static int iteration = 0;
-    if (iteration == 0)
-    {
-        gpu->psPosq4->Download();
-        gpu->psVelm4->Download();
-        printf("# %d atoms\n", gpu->natoms);
-        for (int i = 0; i < gpu->natoms; i++)
-        {
-            printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
-                gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y,
-                gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w,
-                gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y,
-                gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w
-            );       
-        }
-    }
-    iteration++;
-#endif
-#if 0
-    static const float KILO 		        =    1e3;              		// Thousand
-    static const float BOLTZMANN	        =    1.380658e-23f;            // (J/K)	
-    static const float AVOGADRO	            =    6.0221367e23f;		    // ()		
-    static const float RGAS                 =    BOLTZMANN * AVOGADRO;     // (J/(mol K))
-    static const float BOLTZ                =    (RGAS / KILO);            // (kJ/(mol K)) 
-    static int iteration = 0;
-
-    // Check T
-    if (iteration % 1000 == 0)
-    {
-        gpu->psVelm4->Download();
-        float ke = 0.0f;
-        for (int i = 0; i < gpu->natoms; i++)
-        {
-            float vx = gpu->psVelm4->_pSysStream[0][i].x;
-            float vy = gpu->psVelm4->_pSysStream[0][i].y;
-            float vz = gpu->psVelm4->_pSysStream[0][i].z;
-            float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
-            ke += m * (vx * vx + vy * vy + vz * vz);
-        }
-        float T = ke / (BOLTZ  * gpu->sim.degreesOfFreedom);
-        printf("Iteration %d, Temperature is %f\n", iteration, T);
-    }
-    iteration++;
-#endif    
-    if (gpu->bRemoveCM)
-    {
-        kUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kUpdatePart1CM");
-        gpu->bRemoveCM = false;
-
-#if 0
-        gpu->psLinearMomentum->Download();
-        gpu->psVelm4->Download();
-        float3 mv = {0.0f, 0.0f, 0.0f};
-        for (int i = 0; i < gpu->natoms; i++)
-        {
-            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
-            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
-            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
-            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
-        }
-        mv.x *= gpu->sim.inverseTotalMass;
-        mv.y *= gpu->sim.inverseTotalMass;
-        mv.z *= gpu->sim.inverseTotalMass;
-        
-        float3 mv1 = {0.0f, 0.0f, 0.0f};
-        for (int i = 0; i < gpu->sim.blocks; i++)
-        {
-            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
-            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
-            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
-        }
-        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
-#endif
-    }
-    else
-    {    
-        kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kUpdatePart1");
-    }
-}
-
-__global__ void kApplyFirstShake_kernel()
-{
-    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
-    Atom* psA = &sA[threadIdx.x];
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.ShakeConstraints)
-    {
-        int4 atomID         = cSim.pShakeID[pos];
-        float4 params       = cSim.pShakeParameter[pos];
-        float4 apos         = cSim.pOldPosq[atomID.x];
-        float4 xpi          = cSim.pPosqP[atomID.x];
-        float4 apos1        = cSim.pOldPosq[atomID.y];
-        float4 xpj1         = cSim.pPosqP[atomID.y];
-        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
-        psA->InvMassI       = params.x;
-        psA->M              = params.y;
-        psA->d2             = params.z;
-        float invMassJ      = params.w;
-        if (atomID.z != -1)
-        {
-            apos2           = cSim.pOldPosq[atomID.z]; 
-            xpj2            = cSim.pPosqP[atomID.z];
-        }   
-        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
-        if (atomID.w != -1)
-        {
-            apos3           = cSim.pOldPosq[atomID.w]; 
-            xpj3            = cSim.pPosqP[atomID.w];
-        }    
-       
-        float3 xi, xj1, xj2, xj3;
-        xi.x                = apos.x;
-        xi.y                = apos.y;
-        xi.z                = apos.z;
-        xj1.x               = apos1.x;
-        xj1.y               = apos1.y;
-        xj1.z               = apos1.z;
-        xj2.x               = apos2.x;
-        xj2.y               = apos2.y;
-        xj2.z               = apos2.z;
-        xj3.x               = apos3.x;
-        xj3.y               = apos3.y;
-        xj3.z               = apos3.z;
-#ifndef DeltaShake
-        xpi.x              -= xi.x;
-        xpi.y              -= xi.y;
-        xpi.z              -= xi.z;
-        xpj1.x             -= xj1.x;
-        xpj1.y             -= xj1.y;
-        xpj1.z             -= xj1.z;
-        xpj2.x             -= xj2.x;
-        xpj2.y             -= xj2.y;
-        xpj2.z             -= xj2.z;
-        xpj3.x             -= xj3.x;
-        xpj3.y             -= xj3.y;
-        xpj3.z             -= xj3.z;
-#endif
-        psA->rij1.x         = xi.x - xj1.x;
-        psA->rij1.y         = xi.y - xj1.y;
-        psA->rij1.z         = xi.z - xj1.z;
-        psA->rij2.x         = xi.x - xj2.x;
-        psA->rij2.y         = xi.y - xj2.y;
-        psA->rij2.z         = xi.z - xj2.z;
-        psA->rij3.x         = xi.x - xj3.x;
-        psA->rij3.y         = xi.y - xj3.y;
-        psA->rij3.z         = xi.z - xj3.z;
-        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
-        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
-        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-        float ld1           = psA->d2 - psA->rij1sq;
-        float ld2           = psA->d2 - psA->rij2sq;
-        float ld3           = psA->d2 - psA->rij3sq;
-        
-        
-        bool converged = false;
-        int iteration = 0;
-        while (iteration < 15 && !converged)
-        {
-            converged = true;
-            float3 rpij;
-            rpij.x          = xpi.x - xpj1.x;
-            rpij.y          = xpi.y - xpj1.y;
-            rpij.z          = xpi.z - xpj1.z;
-		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; 
-		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-            if (diff >= 1.0f)
-            {
-                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
-                float3 dr;
-                dr.x    = psA->rij1.x * acor;
-                dr.y    = psA->rij1.y * acor;
-                dr.z    = psA->rij1.z * acor;
-		        xpi.x  += dr.x * psA->InvMassI;
-		        xpi.y  += dr.y * psA->InvMassI;
-		        xpi.z  += dr.z * psA->InvMassI;
-		        xpj1.x -= dr.x * invMassJ;
-		        xpj1.y -= dr.y * invMassJ;
-		        xpj1.z -= dr.z * invMassJ;
-                converged = false;
-            }
-            
-            if (atomID.z != -1)
-            {
-                rpij.x          = xpi.x - xpj2.x;
-                rpij.y          = xpi.y - xpj2.y;
-                rpij.z          = xpi.z - xpj2.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; 
-		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-                    float3 dr;
-                    dr.x    = psA->rij2.x * acor;
-                    dr.y    = psA->rij2.y * acor;
-                    dr.z    = psA->rij2.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj2.x -= dr.x * invMassJ;
-		            xpj2.y -= dr.y * invMassJ;
-		            xpj2.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            
-            if (atomID.w != -1)
-            {
-                rpij.x          = xpi.x - xpj3.x;
-                rpij.y          = xpi.y - xpj3.y;
-                rpij.z          = xpi.z - xpj3.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; 
-		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-                    float3 dr;
-                    dr.x    = psA->rij3.x * acor;
-                    dr.y    = psA->rij3.y * acor;
-                    dr.z    = psA->rij3.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj3.x -= dr.x * invMassJ;
-		            xpj3.y -= dr.y * invMassJ;
-		            xpj3.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            iteration++;
-        }
-        
-#ifndef DeltaShake
-        xpi.x  += xi.x;
-        xpi.y  += xi.y;
-        xpi.z  += xi.z;
-
-        xpj1.x += xj1.x;
-        xpj1.y += xj1.y;
-        xpj1.z += xj1.z;
-        
-        xpj2.x += xj2.x;
-        xpj2.y += xj2.y;
-        xpj2.z += xj2.z;
-        
-        xpj3.x += xj3.x;
-        xpj3.y += xj3.y;
-        xpj3.z += xj3.z;
-#endif
-        cSim.pPosqP[atomID.x] = xpi;
-        cSim.pPosqP[atomID.y] = xpj1;
-        if (atomID.z != -1)
-            cSim.pPosqP[atomID.z] = xpj2;
-        if (atomID.w != -1)
-            cSim.pPosqP[atomID.w] = xpj3;     
-   
-        pos += blockDim.x * gridDim.x;
-    }
-}
-
-void kApplyFirstShake(gpuContext gpu)
-{
-//    printf("kApplyFirstShake\n");       
-    if (gpu->sim.ShakeConstraints > 0)
-    {    
-        kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
-        LAUNCHERROR("kApplyFirstShake");
-    }
-}
-
-
-__global__ void kUpdatePart2_kernel()
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    __syncthreads();
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-#ifndef DeltaShake
-        float4 apos             = cSim.pPosq[pos];
-#endif
-        float4 xPrime           = cSim.pPosqP[pos];
-        float4 vVector          = cSim.pvVector4[pos];
-        float4 xVector;
-        float4 random4b         = cSim.pRandom4b[rpos + pos];
-        float2 random2b         = cSim.pRandom2b[rpos + pos];
-        float3 Xmh;
-        
-        float sqrtInvMass       = sqrt(velocity.w);
-#ifdef DeltaShake
-        velocity.x              = xPrime.x * cSim.oneOverFix1;
-        velocity.y              = xPrime.y * cSim.oneOverFix1;
-        velocity.z              = xPrime.z * cSim.oneOverFix1;
-#else
-        velocity.x              = (xPrime.x - apos.x) * cSim.oneOverFix1;
-        velocity.y              = (xPrime.y - apos.y) * cSim.oneOverFix1;
-        velocity.z              = (xPrime.z - apos.z) * cSim.oneOverFix1;
-#endif
-        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.x;
-        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.y;
-        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.z;
-        xVector.x               = sqrtInvMass * random4b.w;
-        xVector.y               = sqrtInvMass * random2b.x;
-        xVector.z               = sqrtInvMass * random2b.y;                    
-        xPrime.x               += xVector.x - Xmh.x;
-        xPrime.y               += xVector.y - Xmh.y;
-        xPrime.z               += xVector.z - Xmh.z;
-        
-    
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        cSim.pxVector4[pos]     = xVector;
-         
-        pos                    += blockDim.x * gridDim.x;    
-    }
-
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-}
-
-__global__ void kUpdatePart2CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    __syncthreads();
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-#ifndef DeltaShake
-        float4 apos             = cSim.pPosq[pos];
-#endif
-        float4 xPrime           = cSim.pPosqP[pos];
-        float4 vVector          = cSim.pvVector4[pos];
-        float4 xVector;
-        float4 random4b         = cSim.pRandom4b[rpos + pos];
-        float2 random2b         = cSim.pRandom2b[rpos + pos];
-        float3 Xmh;
-        float mass              = 1.0f / velocity.w;
-        float sqrtInvMass       = sqrt(velocity.w);
-#ifdef DeltaShake
-        velocity.x              = xPrime.x * cSim.oneOverFix1;
-        velocity.y              = xPrime.y * cSim.oneOverFix1;
-        velocity.z              = xPrime.z * cSim.oneOverFix1;
-#else
-        velocity.x              = (xPrime.x - apos.x) * cSim.oneOverFix1;
-        velocity.y              = (xPrime.y - apos.y) * cSim.oneOverFix1;
-        velocity.z              = (xPrime.z - apos.z) * cSim.oneOverFix1;
-#endif
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-        
-        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.x;
-        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.y;
-        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.z;
-        xVector.x               = sqrtInvMass * random4b.w;
-        xVector.y               = sqrtInvMass * random2b.x;
-        xVector.z               = sqrtInvMass * random2b.y;                    
-        xPrime.x               += xVector.x - Xmh.x;
-        xPrime.y               += xVector.y - Xmh.y;
-        xPrime.z               += xVector.z - Xmh.z;
-        
-    
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        cSim.pxVector4[pos]     = xVector;
-        
-        pos                    += blockDim.x * gridDim.x;    
-    }
-
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-    
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }  
-}
-
-extern void kGenerateRandoms(gpuContext gpu);
-void kUpdatePart2(gpuContext gpu)
-{
-//    printf("kUpdatePart2\n");
-    if (gpu->bCalculateCM)
-    {
-        kUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kUpdatePart2CM");
-        gpu->bCalculateCM = false;
-        gpu->bRemoveCM = true;
-       
-#if 0
-        gpu->psLinearMomentum->Download();
-        gpu->psVelm4->Download();
-        float3 mv = {0.0f, 0.0f, 0.0f};
-        for (int i = 0; i < gpu->natoms; i++)
-        {
-            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
-            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
-            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
-            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
-        }
-        mv.x *= gpu->sim.inverseTotalMass;
-        mv.y *= gpu->sim.inverseTotalMass;
-        mv.z *= gpu->sim.inverseTotalMass;
-        
-        float3 mv1 = {0.0f, 0.0f, 0.0f};
-        for (int i = 0; i < gpu->sim.blocks; i++)
-        {
-            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
-            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
-            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
-        }
-        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
-#endif
-    }
-    else
-    {
-        kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kUpdatePart2");
-    }
-    
-    // Update randoms if necessary
-    gpu->iterations++;
-    if (gpu->iterations == gpu->sim.randomIterations)
-    {
-        kGenerateRandoms(gpu);
-        gpu->iterations = 0;
-    }
-}
-
-
-__global__ void kApplySecondShake_kernel()
-{
-    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
-    Atom* psA = &sA[threadIdx.x];
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.ShakeConstraints)
-    {
-        int4 atomID         = cSim.pShakeID[pos];
-        float4 params       = cSim.pShakeParameter[pos];
-        float4 apos         = cSim.pOldPosq[atomID.x];
-        float4 xpi          = cSim.pPosq[atomID.x];
-        float4 apos1        = cSim.pOldPosq[atomID.y];
-        float4 xpj1         = cSim.pPosq[atomID.y];
-        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
-        psA->InvMassI       = params.x;
-        psA->M              = params.y;
-        psA->d2             = params.z;
-        float invMassJ      = params.w;
-        if (atomID.z != -1)
-        {
-            apos2           = cSim.pOldPosq[atomID.z]; 
-            xpj2            = cSim.pPosq[atomID.z];
-        }   
-        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
-        if (atomID.w != -1)
-        {
-            apos3           = cSim.pOldPosq[atomID.w]; 
-            xpj3            = cSim.pPosq[atomID.w];
-        }    
-       
-        float3 xi, xj1, xj2, xj3;
-        xi.x                = apos.x;
-        xi.y                = apos.y;
-        xi.z                = apos.z;
-        xj1.x               = apos1.x;
-        xj1.y               = apos1.y;
-        xj1.z               = apos1.z;
-        xj2.x               = apos2.x;
-        xj2.y               = apos2.y;
-        xj2.z               = apos2.z;
-        xj3.x               = apos3.x;
-        xj3.y               = apos3.y;
-        xj3.z               = apos3.z;
-#ifndef DeltaShake
-        xpi.x              -= xi.x;
-        xpi.y              -= xi.y;
-        xpi.z              -= xi.z;
-        xpj1.x             -= xj1.x;
-        xpj1.y             -= xj1.y;
-        xpj1.z             -= xj1.z;
-        xpj2.x             -= xj2.x;
-        xpj2.y             -= xj2.y;
-        xpj2.z             -= xj2.z;
-        xpj3.x             -= xj3.x;
-        xpj3.y             -= xj3.y;
-        xpj3.z             -= xj3.z;
-#endif
-        psA->rij1.x         = xi.x - xj1.x;
-        psA->rij1.y         = xi.y - xj1.y;
-        psA->rij1.z         = xi.z - xj1.z;
-        psA->rij2.x         = xi.x - xj2.x;
-        psA->rij2.y         = xi.y - xj2.y;
-        psA->rij2.z         = xi.z - xj2.z;
-        psA->rij3.x         = xi.x - xj3.x;
-        psA->rij3.y         = xi.y - xj3.y;
-        psA->rij3.z         = xi.z - xj3.z;
-        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
-        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
-        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-        float ld1           = psA->d2 - psA->rij1sq;
-        float ld2           = psA->d2 - psA->rij2sq;
-        float ld3           = psA->d2 - psA->rij3sq;
-        
-        
-        bool converged = false;
-        int iteration = 0;
-        while (iteration < 15 && !converged)
-        {
-            converged = true;
-            float3 rpij;
-            rpij.x          = xpi.x - xpj1.x;
-            rpij.y          = xpi.y - xpj1.y;
-            rpij.z          = xpi.z - xpj1.z;
-		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; 
-		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-            if (diff >= 1.0f)
-            {
-                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
-                float3 dr;
-                dr.x    = psA->rij1.x * acor;
-                dr.y    = psA->rij1.y * acor;
-                dr.z    = psA->rij1.z * acor;
-		        xpi.x  += dr.x * psA->InvMassI;
-		        xpi.y  += dr.y * psA->InvMassI;
-		        xpi.z  += dr.z * psA->InvMassI;
-		        xpj1.x -= dr.x * invMassJ;
-		        xpj1.y -= dr.y * invMassJ;
-		        xpj1.z -= dr.z * invMassJ;
-                converged = false;
-            }
-            
-            if (atomID.z != -1)
-            {
-                rpij.x          = xpi.x - xpj2.x;
-                rpij.y          = xpi.y - xpj2.y;
-                rpij.z          = xpi.z - xpj2.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; 
-		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-                    float3 dr;
-                    dr.x    = psA->rij2.x * acor;
-                    dr.y    = psA->rij2.y * acor;
-                    dr.z    = psA->rij2.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj2.x -= dr.x * invMassJ;
-		            xpj2.y -= dr.y * invMassJ;
-		            xpj2.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            
-            if (atomID.w != -1)
-            {
-                rpij.x          = xpi.x - xpj3.x;
-                rpij.y          = xpi.y - xpj3.y;
-                rpij.z          = xpi.z - xpj3.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; 
-		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-                    float3 dr;
-                    dr.x    = psA->rij3.x * acor;
-                    dr.y    = psA->rij3.y * acor;
-                    dr.z    = psA->rij3.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj3.x -= dr.x * invMassJ;
-		            xpj3.y -= dr.y * invMassJ;
-		            xpj3.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            iteration++;
-        }
-        
-        xpi.x += xi.x;
-        xpi.y += xi.y;
-        xpi.z += xi.z;
-        xpj1.x += xj1.x;
-        xpj1.y += xj1.y;
-        xpj1.z += xj1.z;
-        xpj2.x += xj2.x;
-        xpj2.y += xj2.y;
-        xpj2.z += xj2.z;
-        xpj3.x += xj3.x;
-        xpj3.y += xj3.y;
-        xpj3.z += xj3.z;
-
-        cSim.pPosq[atomID.x] = xpi;
-        cSim.pPosq[atomID.y] = xpj1;
-
-        if (atomID.z != -1)
-            cSim.pPosq[atomID.z] = xpj2;
-
-        if (atomID.w != -1)
-            cSim.pPosq[atomID.w] = xpj3;     
-   
-        pos += blockDim.x * gridDim.x;
-    }
-}
-
-__global__ void kApplyNoShake_kernel()
-{
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.NonShakeConstraints)
-    {
-        int  atomID          = cSim.pNonShakeID[pos];
-        float4 apos          = cSim.pOldPosq[atomID];
-        float4 xpi           = cSim.pPosq[atomID];
-        xpi.x               += apos.x;
-        xpi.y               += apos.y;
-        xpi.z               += apos.z;
-        cSim.pPosq[atomID]   = xpi;
-
-        pos += blockDim.x * gridDim.x;
-    }
-}
-
-void kCPUShake2(gpuContext gpu)
-{
-
-}
-
-void kApplySecondShake(gpuContext gpu)
-{
-  //  printf("kApplySecondShake\n");
-  //  kCPUShake2(gpu);
-    if (gpu->sim.ShakeConstraints > 0)
-    {
-        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
-        LAUNCHERROR("kApplySecondShake");
-    }
-
-    // handle non-Shake atoms
-
-#ifdef DeltaShake
-    if (gpu->sim.NonShakeConstraints > 0)
-    {
-        //fprintf( gpu->log, "kApplyNoShake_kernel %d %d \n", gpu->sim.blocks, gpu->sim.nonshake_threads_per_block); fflush( gpu->log );
-        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
-        LAUNCHERROR("kApplyNoShake");
-    }
-#endif
-
-}
-
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+using namespace std;
+
+#include "gputypes.h"
+
+struct Atom
+{
+    float3 rij1;
+    float3 rij2;
+    float3 rij3;
+    float  M;
+    float  d2;
+    float  InvMassI;
+    float  rij1sq;
+    float  rij2sq;
+    float  rij3sq;
+};
+
+
+static __constant__ cudaGmxSimulation cSim;
+
+void SetShakeHSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+void GetShakeHSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+__global__ void kApplyFirstShake_kernel()
+{
+    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
+    Atom* psA = &sA[threadIdx.x];
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.ShakeConstraints)
+    {
+        int4 atomID         = cSim.pShakeID[pos];
+        float4 params       = cSim.pShakeParameter[pos];
+        float4 apos         = cSim.pOldPosq[atomID.x];
+        float4 xpi          = cSim.pPosqP[atomID.x];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
+        float4 xpj1         = cSim.pPosqP[atomID.y];
+        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
+        psA->InvMassI       = params.x;
+        psA->M              = params.y;
+        psA->d2             = params.z;
+        float invMassJ      = params.w;
+        if (atomID.z != -1)
+        {
+            apos2           = cSim.pOldPosq[atomID.z];
+            xpj2            = cSim.pPosqP[atomID.z];
+        }
+        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
+        if (atomID.w != -1)
+        {
+            apos3           = cSim.pOldPosq[atomID.w];
+            xpj3            = cSim.pPosqP[atomID.w];
+        }
+
+        float3 xi, xj1, xj2, xj3;
+        xi.x                = apos.x;
+        xi.y                = apos.y;
+        xi.z                = apos.z;
+        xj1.x               = apos1.x;
+        xj1.y               = apos1.y;
+        xj1.z               = apos1.z;
+        xj2.x               = apos2.x;
+        xj2.y               = apos2.y;
+        xj2.z               = apos2.z;
+        xj3.x               = apos3.x;
+        xj3.y               = apos3.y;
+        xj3.z               = apos3.z;
+        psA->rij1.x         = xi.x - xj1.x;
+        psA->rij1.y         = xi.y - xj1.y;
+        psA->rij1.z         = xi.z - xj1.z;
+        psA->rij2.x         = xi.x - xj2.x;
+        psA->rij2.y         = xi.y - xj2.y;
+        psA->rij2.z         = xi.z - xj2.z;
+        psA->rij3.x         = xi.x - xj3.x;
+        psA->rij3.y         = xi.y - xj3.y;
+        psA->rij3.z         = xi.z - xj3.z;
+        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
+        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
+        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
+        float ld1           = psA->d2 - psA->rij1sq;
+        float ld2           = psA->d2 - psA->rij2sq;
+        float ld3           = psA->d2 - psA->rij3sq;
+
+
+        bool converged = false;
+        int iteration = 0;
+        while (iteration < 15 && !converged)
+        {
+            converged = true;
+            float3 rpij;
+            rpij.x          = xpi.x - xpj1.x;
+            rpij.y          = xpi.y - xpj1.y;
+            rpij.z          = xpi.z - xpj1.z;
+		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
+		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
+            if (diff >= 1.0f)
+            {
+                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
+                float3 dr;
+                dr.x    = psA->rij1.x * acor;
+                dr.y    = psA->rij1.y * acor;
+                dr.z    = psA->rij1.z * acor;
+		        xpi.x  += dr.x * psA->InvMassI;
+		        xpi.y  += dr.y * psA->InvMassI;
+		        xpi.z  += dr.z * psA->InvMassI;
+		        xpj1.x -= dr.x * invMassJ;
+		        xpj1.y -= dr.y * invMassJ;
+		        xpj1.z -= dr.z * invMassJ;
+                converged = false;
+            }
+
+            if (atomID.z != -1)
+            {
+                rpij.x          = xpi.x - xpj2.x;
+                rpij.y          = xpi.y - xpj2.y;
+                rpij.z          = xpi.z - xpj2.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
+		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
+                    float3 dr;
+                    dr.x    = psA->rij2.x * acor;
+                    dr.y    = psA->rij2.y * acor;
+                    dr.z    = psA->rij2.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj2.x -= dr.x * invMassJ;
+		            xpj2.y -= dr.y * invMassJ;
+		            xpj2.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+
+            if (atomID.w != -1)
+            {
+                rpij.x          = xpi.x - xpj3.x;
+                rpij.y          = xpi.y - xpj3.y;
+                rpij.z          = xpi.z - xpj3.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
+		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
+                    float3 dr;
+                    dr.x    = psA->rij3.x * acor;
+                    dr.y    = psA->rij3.y * acor;
+                    dr.z    = psA->rij3.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj3.x -= dr.x * invMassJ;
+		            xpj3.y -= dr.y * invMassJ;
+		            xpj3.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+            iteration++;
+        }
+        cSim.pPosqP[atomID.x] = xpi;
+        cSim.pPosqP[atomID.y] = xpj1;
+        if (atomID.z != -1)
+            cSim.pPosqP[atomID.z] = xpj2;
+        if (atomID.w != -1)
+            cSim.pPosqP[atomID.w] = xpj3;
+
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+void kApplyFirstShake(gpuContext gpu)
+{
+//    printf("kApplyFirstShake\n");
+    if (gpu->sim.ShakeConstraints > 0)
+    {
+        kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
+        LAUNCHERROR("kApplyFirstShake");
+    }
+}
+
+__global__ void kApplySecondShake_kernel()
+{
+    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
+    Atom* psA = &sA[threadIdx.x];
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.ShakeConstraints)
+    {
+        int4 atomID         = cSim.pShakeID[pos];
+        float4 params       = cSim.pShakeParameter[pos];
+        float4 apos         = cSim.pOldPosq[atomID.x];
+        float4 xpi          = cSim.pPosq[atomID.x];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
+        float4 xpj1         = cSim.pPosq[atomID.y];
+        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
+        psA->InvMassI       = params.x;
+        psA->M              = params.y;
+        psA->d2             = params.z;
+        float invMassJ      = params.w;
+        if (atomID.z != -1)
+        {
+            apos2           = cSim.pOldPosq[atomID.z];
+            xpj2            = cSim.pPosq[atomID.z];
+        }
+        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
+        if (atomID.w != -1)
+        {
+            apos3           = cSim.pOldPosq[atomID.w];
+            xpj3            = cSim.pPosq[atomID.w];
+        }
+
+        float3 xi, xj1, xj2, xj3;
+        xi.x                = apos.x;
+        xi.y                = apos.y;
+        xi.z                = apos.z;
+        xj1.x               = apos1.x;
+        xj1.y               = apos1.y;
+        xj1.z               = apos1.z;
+        xj2.x               = apos2.x;
+        xj2.y               = apos2.y;
+        xj2.z               = apos2.z;
+        xj3.x               = apos3.x;
+        xj3.y               = apos3.y;
+        xj3.z               = apos3.z;
+        psA->rij1.x         = xi.x - xj1.x;
+        psA->rij1.y         = xi.y - xj1.y;
+        psA->rij1.z         = xi.z - xj1.z;
+        psA->rij2.x         = xi.x - xj2.x;
+        psA->rij2.y         = xi.y - xj2.y;
+        psA->rij2.z         = xi.z - xj2.z;
+        psA->rij3.x         = xi.x - xj3.x;
+        psA->rij3.y         = xi.y - xj3.y;
+        psA->rij3.z         = xi.z - xj3.z;
+        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
+        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
+        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
+        float ld1           = psA->d2 - psA->rij1sq;
+        float ld2           = psA->d2 - psA->rij2sq;
+        float ld3           = psA->d2 - psA->rij3sq;
+
+
+        bool converged = false;
+        int iteration = 0;
+        while (iteration < 15 && !converged)
+        {
+            converged = true;
+            float3 rpij;
+            rpij.x          = xpi.x - xpj1.x;
+            rpij.y          = xpi.y - xpj1.y;
+            rpij.z          = xpi.z - xpj1.z;
+		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
+		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
+            if (diff >= 1.0f)
+            {
+                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
+                float3 dr;
+                dr.x    = psA->rij1.x * acor;
+                dr.y    = psA->rij1.y * acor;
+                dr.z    = psA->rij1.z * acor;
+		        xpi.x  += dr.x * psA->InvMassI;
+		        xpi.y  += dr.y * psA->InvMassI;
+		        xpi.z  += dr.z * psA->InvMassI;
+		        xpj1.x -= dr.x * invMassJ;
+		        xpj1.y -= dr.y * invMassJ;
+		        xpj1.z -= dr.z * invMassJ;
+                converged = false;
+            }
+
+            if (atomID.z != -1)
+            {
+                rpij.x          = xpi.x - xpj2.x;
+                rpij.y          = xpi.y - xpj2.y;
+                rpij.z          = xpi.z - xpj2.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
+		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
+                    float3 dr;
+                    dr.x    = psA->rij2.x * acor;
+                    dr.y    = psA->rij2.y * acor;
+                    dr.z    = psA->rij2.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj2.x -= dr.x * invMassJ;
+		            xpj2.y -= dr.y * invMassJ;
+		            xpj2.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+
+            if (atomID.w != -1)
+            {
+                rpij.x          = xpi.x - xpj3.x;
+                rpij.y          = xpi.y - xpj3.y;
+                rpij.z          = xpi.z - xpj3.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
+		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
+                    float3 dr;
+                    dr.x    = psA->rij3.x * acor;
+                    dr.y    = psA->rij3.y * acor;
+                    dr.z    = psA->rij3.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj3.x -= dr.x * invMassJ;
+		            xpj3.y -= dr.y * invMassJ;
+		            xpj3.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+            iteration++;
+        }
+
+        xpi.x += xi.x;
+        xpi.y += xi.y;
+        xpi.z += xi.z;
+        xpj1.x += xj1.x;
+        xpj1.y += xj1.y;
+        xpj1.z += xj1.z;
+        xpj2.x += xj2.x;
+        xpj2.y += xj2.y;
+        xpj2.z += xj2.z;
+        xpj3.x += xj3.x;
+        xpj3.y += xj3.y;
+        xpj3.z += xj3.z;
+
+        cSim.pPosq[atomID.x] = xpi;
+        cSim.pPosq[atomID.y] = xpj1;
+
+        if (atomID.z != -1)
+            cSim.pPosq[atomID.z] = xpj2;
+
+        if (atomID.w != -1)
+            cSim.pPosq[atomID.w] = xpj3;
+
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+__global__ void kApplyNoShake_kernel()
+{
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.NonShakeConstraints)
+    {
+        int  atomID          = cSim.pNonShakeID[pos];
+        float4 apos          = cSim.pOldPosq[atomID];
+        float4 xpi           = cSim.pPosq[atomID];
+        xpi.x               += apos.x;
+        xpi.y               += apos.y;
+        xpi.z               += apos.z;
+        cSim.pPosq[atomID]   = xpi;
+
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+
+void kApplySecondShake(gpuContext gpu)
+{
+  //  printf("kApplySecondShake\n");
+    if (gpu->sim.ShakeConstraints > 0)
+    {
+        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
+        LAUNCHERROR("kApplySecondShake");
+    }
+
+    // handle non-Shake atoms
+
+    if (gpu->sim.NonShakeConstraints > 0)
+    {
+        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
+        LAUNCHERROR("kApplyNoShake");
+    }
+}
+
--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
@@ -35,8 +35,6 @@ using namespace std;

 #include "gputypes.h"

-#define DeltaShake
-
 static __constant__ cudaGmxSimulation cSim;

 void SetVerletUpdateSim(gpuContext gpu)
@@ -53,102 +51,11 @@ void GetVerletUpdateSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kVerletUpdatePart1_kernel()
-{
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
-
-        cSim.pOldPosq[pos]      = apos;        
-        velocity.x             += dtOverMass*force.x;
-        velocity.y             += dtOverMass*force.y;
-        velocity.z             += dtOverMass*force.z;
-
-#ifndef DeltaShake
-        apos.x                 += velocity.x*cSim.deltaT;
-        apos.y                 += velocity.y*cSim.deltaT;
-        apos.z                 += velocity.z*cSim.deltaT;
-#else
-        apos.x                  = velocity.x*cSim.deltaT;
-        apos.y                  = velocity.y*cSim.deltaT;
-        apos.z                  = velocity.z*cSim.deltaT;
-#endif
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-
-__global__ void kVerletUpdatePart1CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    
-    // Read CM outputs from previous step
-    unsigned int cpos = threadIdx.x;
-    while (cpos < gridDim.x)
-    {
-        CM1             = cSim.pLinearMomentum[cpos];
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
-        CM.z           += CM1.z;
-        cpos           += blockDim.x;
-    }
-    sCM[threadIdx.x].x  = CM.x;
-    sCM[threadIdx.x].y  = CM.y;
-    sCM[threadIdx.x].z  = CM.z;
-    __syncthreads();
-    
-    // Reduce CM
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }       
-    
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
-
-        cSim.pOldPosq[pos]      = apos;        
-        velocity.x             += dtOverMass*force.x-sCM[0].x;
-        velocity.y             += dtOverMass*force.y-sCM[0].y;
-        velocity.z             += dtOverMass*force.z-sCM[0].z;
+// Include versions of the kernels with and with center of mass motion removal.

-#ifndef DeltaShake
-        apos.x                 += velocity.x*cSim.deltaT;
-        apos.y                 += velocity.y*cSim.deltaT;
-        apos.z                 += velocity.z*cSim.deltaT;
-#else
-        apos.x                  = velocity.x*cSim.deltaT;
-        apos.y                  = velocity.y*cSim.deltaT;
-        apos.z                  = velocity.z*cSim.deltaT;
-#endif
-
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
+#include "kVerletUpdate.h"
+#define REMOVE_CM
+#include "kVerletUpdate.h"

 void kVerletUpdatePart1(gpuContext gpu)
 {
@@ -166,105 +73,6 @@ void kVerletUpdatePart1(gpuContext gpu)
    }
 }

-__global__ void kVerletUpdatePart2_kernel()
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#endif
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-         
-        pos                    += blockDim.x * gridDim.x;    
-    }
-}
-
-__global__ void kVerletUpdatePart2CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-        float mass              = 1.0f / velocity.w;
-
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#endif
-
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-         
-        pos                    += blockDim.x * gridDim.x;    
-    }
-    
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }  
-}
-
 void kVerletUpdatePart2(gpuContext gpu)
 {
 //    printf("kVerletUpdatePart2\n");

--- a/platforms/cuda/src/kernels/kVerletUpdate.h
+++ b/platforms/cuda/src/kernels/kVerletUpdate.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernels for Verlet integration.  It is included
+ * several times in kVerletUpdate.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+#ifdef REMOVE_CM
+__global__ void kVerletUpdatePart1CM_kernel()
+#else
+__global__ void kVerletUpdatePart1_kernel()
+#endif
+{
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM           = { 0.0f, 0.0f, 0.0f};
+    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+    // Read CM outputs from previous step
+    unsigned int cpos = threadIdx.x;
+    while (cpos < gridDim.x)
+    {
+        CM1             = cSim.pLinearMomentum[cpos];
+        CM.x           += CM1.x;
+        CM.y           += CM1.y;
+        CM.z           += CM1.z;
+        cpos           += blockDim.x;
+    }
+    sCM[threadIdx.x].x  = CM.x;
+    sCM[threadIdx.x].y  = CM.y;
+    sCM[threadIdx.x].z  = CM.z;
+    __syncthreads();
+
+    // Reduce CM
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+#endif
+    while (pos < cSim.atoms)
+    {
+        float4 apos             = cSim.pPosq[pos];
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 force            = cSim.pForce4[pos];
+        float dtOverMass        = cSim.deltaT*velocity.w;
+
+        cSim.pOldPosq[pos]      = apos;
+        velocity.x             += dtOverMass*force.x;
+        velocity.y             += dtOverMass*force.y;
+        velocity.z             += dtOverMass*force.z;
+#ifdef REMOVE_CM
+        velocity.x             -= sCM[0].x;
+        velocity.y             -= sCM[0].y;
+        velocity.z             -= sCM[0].z;
+#endif
+
+        apos.x                  = velocity.x*cSim.deltaT;
+        apos.y                  = velocity.y*cSim.deltaT;
+        apos.z                  = velocity.z*cSim.deltaT;
+
+        cSim.pPosqP[pos]        = apos;
+        cSim.pVelm4[pos]        = velocity;
+        pos                    += blockDim.x * gridDim.x;
+    }
+}
+
+#ifdef REMOVE_CM
+__global__ void kVerletUpdatePart2CM_kernel()
+#else
+__global__ void kVerletUpdatePart2_kernel()
+#endif
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM                   = {0.0f, 0.0f, 0.0f};
+#endif
+
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 apos             = cSim.pPosq[pos];
+        float4 xPrime           = cSim.pPosqP[pos];
+
+        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
+        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
+        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
+
+        xPrime.x               += apos.x;
+        xPrime.y               += apos.y;
+        xPrime.z               += apos.z;
+
+#ifdef REMOVE_CM
+        float mass              = 1.0f / velocity.w;
+        CM.x                   += mass * velocity.x;
+        CM.y                   += mass * velocity.y;
+        CM.z                   += mass * velocity.z;
+#endif
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+
+        pos                    += blockDim.x * gridDim.x;
+    }
+
+#ifdef REMOVE_CM
+    // Scale CM
+    CM.x *= cSim.inverseTotalMass;
+    CM.y *= cSim.inverseTotalMass;
+    CM.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = CM;
+    __syncthreads();
+
+    // Reduce CM for CTA
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        float4 CM;
+        CM.x                                = sCM[0].x;
+        CM.y                                = sCM[0].y;
+        CM.z                                = sCM[0].z;
+        CM.w                                = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x]    = CM;
+    }
+#endif
+}