Various cleanups to integration code

3749e84b · Peter Eastman · 1ef6e1d8 · 3749e84b · 3749e84b · 3749e84b
Commit 3749e84b authored Jun 24, 2009 by Peter Eastman
11 changed files
--- a/openmmapi/include/openmm/VerletIntegrator.h
+++ b/openmmapi/include/openmm/VerletIntegrator.h
@@ -39,7 +39,7 @@
 namespace OpenMM {
 /**
- * This is an Integrator which simulates a System using the velocity Verlet algorithm.
+ * This is an Integrator which simulates a System using the leap-frog Verlet algorithm.
 */
 class OPENMM_EXPORT VerletIntegrator : public Integrator {

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -488,7 +488,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
        prevFriction = friction;
        prevStepSize = stepSize;
    }
-    kUpdatePart1(gpu);
+    kLangevinUpdatePart1(gpu);
    kApplyFirstShake(gpu);
    kApplyFirstSettle(gpu);
    kApplyFirstCCMA(gpu);
@@ -497,7 +497,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
        if (step%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
    }
-    kUpdatePart2(gpu);
+    kLangevinUpdatePart2(gpu);
    kApplySecondShake(gpu);
    kApplySecondSettle(gpu);
    kApplySecondCCMA(gpu);

--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -40,14 +40,14 @@ extern void kCalculateObcGbsaForces2(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
 extern void kCalculateAndersenThermostat(gpuContext gpu);
 extern void kReduceBornSumAndForces(gpuContext gpu);
-extern void kUpdatePart1(gpuContext gpu);
 extern void kApplyFirstShake(gpuContext gpu);
 extern void kApplyFirstCCMA(gpuContext gpu);
 extern void kApplyFirstSettle(gpuContext gpu);
-extern void kUpdatePart2(gpuContext gpu);
 extern void kApplySecondShake(gpuContext gpu);
 extern void kApplySecondCCMA(gpuContext gpu);
 extern void kApplySecondSettle(gpuContext gpu);
+extern void kLangevinUpdatePart1(gpuContext gpu);
+extern void kLangevinUpdatePart2(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
 extern void kVerletUpdatePart2(gpuContext gpu);
 extern void kBrownianUpdatePart1(gpuContext gpu);
@@ -72,8 +72,10 @@ extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void SetForcesSim(gpuContext gpu);
 extern void GetForcesSim(gpuContext gpu);
-extern void SetUpdateShakeHSim(gpuContext gpu);
+extern void SetShakeHSim(gpuContext gpu);
-extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void GetShakeHSim(gpuContext gpu);
+extern void SetLangevinUpdateSim(gpuContext gpu);
+extern void GetLangevinUpdateSim(gpuContext gpu);
 extern void SetSettleSim(gpuContext gpu);
 extern void GetSettleSim(gpuContext gpu);
 extern void SetCCMASim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1757,7 +1757,8 @@ int gpuSetConstants(gpuContext gpu)
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);
-    SetUpdateShakeHSim(gpu);
+    SetShakeHSim(gpu);
+    SetLangevinUpdateSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetSettleSim(gpu);

--- a/platforms/cuda/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda/src/kernels/kBrownianUpdate.cu
@@ -35,8 +35,6 @@ using namespace std;
 #include "gputypes.h"
-#define DeltaShake
 static __constant__ cudaGmxSimulation cSim;
 void SetBrownianUpdateSim(gpuContext gpu)
@@ -66,15 +64,9 @@ __global__ void kBrownianUpdatePart1_kernel()
        float4 force            = cSim.pForce4[pos];
        cSim.pOldPosq[pos]      = apos;
-#ifndef DeltaShake
-        apos.x                 += force.x*cSim.GDT + random4a.x;
-        apos.y                 += force.y*cSim.GDT + random4a.y;
-        apos.z                 += force.z*cSim.GDT + random4a.z;
-#else
        apos.x                  = force.x*cSim.GDT + random4a.x;
        apos.y                  = force.y*cSim.GDT + random4a.y;
        apos.z                  = force.z*cSim.GDT + random4a.z;
-#endif
        cSim.pPosqP[pos]        = apos;
        pos                    += blockDim.x * gridDim.x;
    }
@@ -99,11 +91,6 @@ __global__ void kBrownianUpdatePart2_kernel()
        float4 apos             = cSim.pPosq[pos];
        float4 xPrime           = cSim.pPosqP[pos];
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
@@ -111,7 +98,7 @@ __global__ void kBrownianUpdatePart2_kernel()
        xPrime.x               += apos.x;
        xPrime.y               += apos.y;
        xPrime.z               += apos.z;
-#endif
        cSim.pPosq[pos]         = xPrime;
        cSim.pVelm4[pos]        = velocity;

--- a/platforms/cuda/src/kernels/kLangevinUpdate.cu
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+using namespace std;
+#include "gputypes.h"
+static __constant__ cudaGmxSimulation cSim;
+void SetLangevinUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+void GetLangevinUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+// Include versions of the kernels with and with center of mass motion removal.
+#include "kLangevinUpdate.h"
+#define REMOVE_CM
+#include "kLangevinUpdate.h"
+void kLangevinUpdatePart1(gpuContext gpu)
+{
+//    printf("kLangevinUpdatePart1\n");
+    if (gpu->bRemoveCM)
+    {
+        kLangevinUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kLangevinUpdatePart1CM");
+        gpu->bRemoveCM = false;
+    }
+    else
+    {    
+        kLangevinUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kLangevinUpdatePart1");
+    }
+}
+extern void kGenerateRandoms(gpuContext gpu);
+void kLangevinUpdatePart2(gpuContext gpu)
+{
+//    printf("kLangevinUpdatePart2\n");
+    if (gpu->bCalculateCM)
+    {
+        kLangevinUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kLangevinUpdatePart2CM");
+        gpu->bCalculateCM = false;
+        gpu->bRemoveCM = true;
+    }
+    else
+    {
+        kLangevinUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kLangevinUpdatePart2");
+    }
+    // Update randoms if necessary
+    gpu->iterations++;
+    if (gpu->iterations == gpu->sim.randomIterations)
+    {
+        kGenerateRandoms(gpu);
+        gpu->iterations = 0;
+    }
+}
--- a/platforms/cuda/src/kernels/kLangevinUpdate.h
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+/**
+ * This file contains the kernels for Langevin integration.  It is included
+ * several times in kLangevinUpdate.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+#ifdef REMOVE_CM
+__global__ void kLangevinUpdatePart1CM_kernel()
+#else
+__global__ void kLangevinUpdatePart1_kernel()
+#endif
+{
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM           = { 0.0f, 0.0f, 0.0f};
+    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
+    // Read CM outputs from previous step
+    unsigned int cpos = threadIdx.x;
+    while (cpos < gridDim.x)
+    {
+        CM1             = cSim.pLinearMomentum[cpos];
+        CM.x           += CM1.x;
+        CM.y           += CM1.y;
+        CM.z           += CM1.z;
+        cpos           += blockDim.x;
+    }
+    sCM[threadIdx.x].x  = CM.x;
+    sCM[threadIdx.x].y  = CM.y;
+    sCM[threadIdx.x].z  = CM.z;
+    __syncthreads();
+    // Reduce CM
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+#endif
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xVector          = cSim.pxVector4[pos];
+        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float2 random2a         = cSim.pRandom2a[rpos + pos];
+        float4 apos             = cSim.pPosq[pos];
+        float4 force            = cSim.pForce4[pos];
+        float3 Vmh;
+        float sqrtInvMass       = sqrt(velocity.w);
+        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
+        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
+        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
+        float4 vVector;
+        vVector.x               = sqrtInvMass * random4a.w;
+        vVector.y               = sqrtInvMass * random2a.x;
+        vVector.z               = sqrtInvMass * random2a.y;
+        vVector.w               = 0.0f;
+        cSim.pvVector4[pos]     = vVector;
+        velocity.x              = velocity.x * cSim.EM +
+                                  velocity.w * force.x * cSim.TauOneMinusEM +
+                                  vVector.x -
+                                  cSim.EM * Vmh.x;
+        velocity.y              = velocity.y * cSim.EM +
+                                  velocity.w * force.y * cSim.TauOneMinusEM +
+                                  vVector.y -
+                                  cSim.EM * Vmh.y;
+        velocity.z              = velocity.z * cSim.EM +
+                                  velocity.w * force.z * cSim.TauOneMinusEM +
+                                  vVector.z -
+                                  cSim.EM * Vmh.z;
+#ifdef REMOVE_CM
+        velocity.x             -= sCM[0].x;
+        velocity.y             -= sCM[0].y;
+        velocity.z             -= sCM[0].z;
+#endif
+        cSim.pOldPosq[pos]      = apos;
+        apos.x                  = velocity.x * cSim.fix1;
+        apos.y                  = velocity.y * cSim.fix1;
+        apos.z                  = velocity.z * cSim.fix1;
+        cSim.pPosqP[pos]        = apos;
+        cSim.pVelm4[pos]        = velocity;
+        pos                    += blockDim.x * gridDim.x;
+    }
+}
+#ifdef REMOVE_CM
+__global__ void kLangevinUpdatePart2CM_kernel()
+#else
+__global__ void kLangevinUpdatePart2_kernel()
+#endif
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM                   = {0.0f, 0.0f, 0.0f};
+    __syncthreads();
+#endif
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xPrime           = cSim.pPosqP[pos];
+        float4 vVector          = cSim.pvVector4[pos];
+        float4 xVector;
+        float4 random4b         = cSim.pRandom4b[rpos + pos];
+        float2 random2b         = cSim.pRandom2b[rpos + pos];
+        float3 Xmh;
+        float sqrtInvMass       = sqrt(velocity.w);
+        velocity.x              = xPrime.x * cSim.oneOverFix1;
+        velocity.y              = xPrime.y * cSim.oneOverFix1;
+        velocity.z              = xPrime.z * cSim.oneOverFix1;
+#ifdef REMOVE_CM
+        float mass              = 1.0f / velocity.w;
+        CM.x                   += mass * velocity.x;
+        CM.y                   += mass * velocity.y;
+        CM.z                   += mass * velocity.z;
+#endif;
+        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.x;
+        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.y;
+        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.z;
+        xVector.x               = sqrtInvMass * random4b.w;
+        xVector.y               = sqrtInvMass * random2b.x;
+        xVector.z               = sqrtInvMass * random2b.y;
+        xPrime.x               += xVector.x - Xmh.x;
+        xPrime.y               += xVector.y - Xmh.y;
+        xPrime.z               += xVector.z - Xmh.z;
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+        cSim.pxVector4[pos]     = xVector;
+        pos                    += blockDim.x * gridDim.x;
+    }
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+#ifdef REMOVE_CM
+    // Scale CM
+    CM.x *= cSim.inverseTotalMass;
+    CM.y *= cSim.inverseTotalMass;
+    CM.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = CM;
+    __syncthreads();
+    // Reduce CM for CTA
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        float4 CM;
+        CM.x                                = sCM[0].x;
+        CM.y                                = sCM[0].y;
+        CM.z                                = sCM[0].z;
+        CM.w                                = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x]    = CM;
+    }
+#endif
+}
--- a/platforms/cuda/src/kernels/kSettle.cu
+++ b/platforms/cuda/src/kernels/kSettle.cu
@@ -33,8 +33,6 @@
 //#include <fstream>
 using namespace std;
-#define DeltaShake
 #include "gputypes.h"

--- a/platforms/cuda/src/kernels/kUpdateShakeH.cu
+++ b/platforms/cuda/src/kernels/kUpdateShakeH.cu
 /* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */
 #include <stdio.h>
 #include <cuda.h>
 #include <vector_functions.h>
 #include <cstdlib>
 #include <string>
 #include <iostream>
-//#include <fstream>
+using namespace std;
-using namespace std;
+#include "gputypes.h"
-#define DeltaShake
+struct Atom
-#include "gputypes.h"
+{
+    float3 rij1;
-struct Atom 
+    float3 rij2;
-{
+    float3 rij3;
-    float3 rij1;
+    float  M;
-    float3 rij2;
+    float  d2;
-    float3 rij3;
+    float  InvMassI;
-    float  M;
+    float  rij1sq;
-    float  d2;
+    float  rij2sq;
-    float  InvMassI;
+    float  rij3sq;
-    float  rij1sq;
+};
-    float  rij2sq;
-    float  rij3sq;
-};
+static __constant__ cudaGmxSimulation cSim;
+void SetShakeHSim(gpuContext gpu)
-static __constant__ cudaGmxSimulation cSim;
+{
+    cudaError_t status;
-void SetUpdateShakeHSim(gpuContext gpu)
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-{
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    cudaError_t status;
+}
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+void GetShakeHSim(gpuContext gpu)
-}
+{
+    cudaError_t status;
-void GetUpdateShakeHSim(gpuContext gpu)
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-{
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-    cudaError_t status;
+}
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+__global__ void kApplyFirstShake_kernel()
-}
+{
+    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
-__global__ void kUpdatePart1_kernel()
+    Atom* psA = &sA[threadIdx.x];
-{
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.ShakeConstraints)
-    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
+    {
-    __syncthreads();
+        int4 atomID         = cSim.pShakeID[pos];
+        float4 params       = cSim.pShakeParameter[pos];
-    while (pos < cSim.atoms)
+        float4 apos         = cSim.pOldPosq[atomID.x];
-    {
+        float4 xpi          = cSim.pPosqP[atomID.x];
-        float4 velocity         = cSim.pVelm4[pos];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
-        float4 xVector          = cSim.pxVector4[pos];
+        float4 xpj1         = cSim.pPosqP[atomID.y];
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float2 random2a         = cSim.pRandom2a[rpos + pos];
+        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 apos             = cSim.pPosq[pos];
+        psA->InvMassI       = params.x;
-        float4 force            = cSim.pForce4[pos];
+        psA->M              = params.y;
+        psA->d2             = params.z;
-        float3 Vmh;
+        float invMassJ      = params.w;
-        float sqrtInvMass       = sqrt(velocity.w);
+        if (atomID.z != -1)
-        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
+        {
-        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
+            apos2           = cSim.pOldPosq[atomID.z];
-        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
+            xpj2            = cSim.pPosqP[atomID.z];
-        float4 vVector;
+        }
-        vVector.x               = sqrtInvMass * random4a.w;
+        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
-        vVector.y               = sqrtInvMass * random2a.x;
+        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
-        vVector.z               = sqrtInvMass * random2a.y;
+        if (atomID.w != -1)
-        vVector.w               = 0.0f;
+        {
-        cSim.pvVector4[pos]     = vVector;
+            apos3           = cSim.pOldPosq[atomID.w];
-        velocity.x              = velocity.x * cSim.EM + 
+            xpj3            = cSim.pPosqP[atomID.w];
-                                  velocity.w * force.x * cSim.TauOneMinusEM +
+        }
-                                  vVector.x -
-                                  cSim.EM * Vmh.x;
+        float3 xi, xj1, xj2, xj3;
-        velocity.y              = velocity.y * cSim.EM + 
+        xi.x                = apos.x;
-                                  velocity.w * force.y * cSim.TauOneMinusEM +
+        xi.y                = apos.y;
-                                  vVector.y -
+        xi.z                = apos.z;
-                                  cSim.EM * Vmh.y;
+        xj1.x               = apos1.x;
-        velocity.z              = velocity.z * cSim.EM + 
+        xj1.y               = apos1.y;
-                                  velocity.w * force.z * cSim.TauOneMinusEM +
+        xj1.z               = apos1.z;
-                                  vVector.z -
+        xj2.x               = apos2.x;
-                                  cSim.EM * Vmh.z;
+        xj2.y               = apos2.y;
-        cSim.pOldPosq[pos]      = apos;
+        xj2.z               = apos2.z;
-#ifndef DeltaShake
+        xj3.x               = apos3.x;
-        apos.x                 += velocity.x * cSim.fix1;
+        xj3.y               = apos3.y;
-        apos.y                 += velocity.y * cSim.fix1;
+        xj3.z               = apos3.z;
-        apos.z                 += velocity.z * cSim.fix1;
+        psA->rij1.x         = xi.x - xj1.x;
-#else
+        psA->rij1.y         = xi.y - xj1.y;
-        apos.x                  = velocity.x * cSim.fix1;
+        psA->rij1.z         = xi.z - xj1.z;
-        apos.y                  = velocity.y * cSim.fix1;
+        psA->rij2.x         = xi.x - xj2.x;
-        apos.z                  = velocity.z * cSim.fix1;
+        psA->rij2.y         = xi.y - xj2.y;
-#endif
+        psA->rij2.z         = xi.z - xj2.z;
-        cSim.pPosqP[pos]        = apos;
+        psA->rij3.x         = xi.x - xj3.x;
-        cSim.pVelm4[pos]        = velocity;        
+        psA->rij3.y         = xi.y - xj3.y;
-        pos                    += blockDim.x * gridDim.x;
+        psA->rij3.z         = xi.z - xj3.z;
-    }
+        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
-}
+        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
+        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-__global__ void kUpdatePart1CM_kernel()
+        float ld1           = psA->d2 - psA->rij1sq;
-{
+        float ld2           = psA->d2 - psA->rij2sq;
-    extern __shared__ float3 sCM[];
+        float ld3           = psA->d2 - psA->rij3sq;
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
+        bool converged = false;
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
+        int iteration = 0;
+        while (iteration < 15 && !converged)
-    // Read CM outputs from previous step
+        {
-    unsigned int cpos = threadIdx.x;
+            converged = true;
-#if 0
+            float3 rpij;
-    float4 CM2          = { 0.0f, 0.0f, 0.0f, 0.0f };
+            rpij.x          = xpi.x - xpj1.x;
-    float4 CM3          = { 0.0f, 0.0f, 0.0f, 0.0f };
+            rpij.y          = xpi.y - xpj1.y;
-    float4 CM4          = { 0.0f, 0.0f, 0.0f, 0.0f };
+            rpij.z          = xpi.z - xpj1.z;
-    if (cpos < gridDim.x)
+		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-        CM1             = cSim.pLinearMomentum[cpos];
+		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
-    cpos               += gridDim.x;
+		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-    if (cpos < gridDim.x)
+            if (diff >= 1.0f)
-        CM2             = cSim.pLinearMomentum[cpos];
+            {
-    cpos               += gridDim.x;
+                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
-    if (cpos < gridDim.x)
+                float3 dr;
-        CM3             = cSim.pLinearMomentum[cpos];
+                dr.x    = psA->rij1.x * acor;
-    cpos               += gridDim.x;
+                dr.y    = psA->rij1.y * acor;
-    if (cpos < gridDim.x)
+                dr.z    = psA->rij1.z * acor;
-        CM4             = cSim.pLinearMomentum[cpos];
+		        xpi.x  += dr.x * psA->InvMassI;
-    sCM[threadIdx.x].x  = CM1.x + CM2.x + CM3.x + CM4.x;
+		        xpi.y  += dr.y * psA->InvMassI;
-    sCM[threadIdx.x].y  = CM1.y + CM2.y + CM3.y + CM4.y;
+		        xpi.z  += dr.z * psA->InvMassI;
-    sCM[threadIdx.x].z  = CM1.z + CM2.z + CM3.z + CM4.z;
+		        xpj1.x -= dr.x * invMassJ;
-#else
+		        xpj1.y -= dr.y * invMassJ;
-    while (cpos < gridDim.x)
+		        xpj1.z -= dr.z * invMassJ;
-    {
+                converged = false;
-        CM1             = cSim.pLinearMomentum[cpos];
+            }
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
+            if (atomID.z != -1)
-        CM.z           += CM1.z;
+            {
-        cpos           += blockDim.x;
+                rpij.x          = xpi.x - xpj2.x;
-    }
+                rpij.y          = xpi.y - xpj2.y;
-    sCM[threadIdx.x].x  = CM.x;
+                rpij.z          = xpi.z - xpj2.z;
-    sCM[threadIdx.x].y  = CM.y;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-    sCM[threadIdx.x].z  = CM.z;
+		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
-#endif
+		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-    __syncthreads();
+                if (diff >= 1.0f)
+                {
-    // Reduce CM
+                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-    unsigned int offset = 1;
+                    float3 dr;
-    unsigned int mask   = 1;
+                    dr.x    = psA->rij2.x * acor;
-    while (offset < blockDim.x)
+                    dr.y    = psA->rij2.y * acor;
-    {
+                    dr.z    = psA->rij2.z * acor;
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+		            xpi.x  += dr.x * psA->InvMassI;
-        {
+		            xpi.y  += dr.y * psA->InvMassI;
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+		            xpi.z  += dr.z * psA->InvMassI;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+		            xpj2.x -= dr.x * invMassJ;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+		            xpj2.y -= dr.y * invMassJ;
-        }
+		            xpj2.z -= dr.z * invMassJ;
-        mask = 2 * mask + 1;
+                    converged = false;
-        offset *= 2;
+                }
-        __syncthreads();
+            }
-    }       
+            if (atomID.w != -1)
-    while (pos < cSim.atoms)
+            {
-    {
+                rpij.x          = xpi.x - xpj3.x;
-        float4 velocity         = cSim.pVelm4[pos];
+                rpij.y          = xpi.y - xpj3.y;
-        float4 xVector          = cSim.pxVector4[pos];
+                rpij.z          = xpi.z - xpj3.z;
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-        float2 random2a         = cSim.pRandom2a[rpos + pos];
+		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
-        float4 apos             = cSim.pPosq[pos];
+		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-        float4 force            = cSim.pForce4[pos];
+                if (diff >= 1.0f)
+                {
-        float3 Vmh;
+                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-        float sqrtInvMass       = sqrt(velocity.w);
+                    float3 dr;
-        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
+                    dr.x    = psA->rij3.x * acor;
-        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
+                    dr.y    = psA->rij3.y * acor;
-        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
+                    dr.z    = psA->rij3.z * acor;
-        float4 vVector;
+		            xpi.x  += dr.x * psA->InvMassI;
-        vVector.x               = sqrtInvMass * random4a.w;
+		            xpi.y  += dr.y * psA->InvMassI;
-        vVector.y               = sqrtInvMass * random2a.x;
+		            xpi.z  += dr.z * psA->InvMassI;
-        vVector.z               = sqrtInvMass * random2a.y;
+		            xpj3.x -= dr.x * invMassJ;
-        vVector.w               = 0.0f;
+		            xpj3.y -= dr.y * invMassJ;
-        cSim.pvVector4[pos]     = vVector;
+		            xpj3.z -= dr.z * invMassJ;
-        velocity.x              = velocity.x * cSim.EM + 
+                    converged = false;
-                                  velocity.w * force.x * cSim.TauOneMinusEM +
+                }
-                                  vVector.x -
+            }
-                                  cSim.EM * Vmh.x -
+            iteration++;
-                                  sCM[0].x;
+        }
-        velocity.y              = velocity.y * cSim.EM + 
+        cSim.pPosqP[atomID.x] = xpi;
-                                  velocity.w * force.y * cSim.TauOneMinusEM +
+        cSim.pPosqP[atomID.y] = xpj1;
-                                  vVector.y -
+        if (atomID.z != -1)
-                                  cSim.EM * Vmh.y -
+            cSim.pPosqP[atomID.z] = xpj2;
-                                  sCM[0].y;
+        if (atomID.w != -1)
-        velocity.z              = velocity.z * cSim.EM + 
+            cSim.pPosqP[atomID.w] = xpj3;
-                                  velocity.w * force.z * cSim.TauOneMinusEM +
-                                  vVector.z -
+        pos += blockDim.x * gridDim.x;
-                                  cSim.EM * Vmh.z -
+    }
-                                  sCM[0].z;
+}
-        cSim.pOldPosq[pos]      = apos;
-#ifndef DeltaShake
+void kApplyFirstShake(gpuContext gpu)
-        apos.x                 += velocity.x * cSim.fix1;
+{
-        apos.y                 += velocity.y * cSim.fix1;
+//    printf("kApplyFirstShake\n");
-        apos.z                 += velocity.z * cSim.fix1;
+    if (gpu->sim.ShakeConstraints > 0)
-#else
+    {
-        apos.x                  = velocity.x * cSim.fix1;
+        kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
-        apos.y                  = velocity.y * cSim.fix1;
+        LAUNCHERROR("kApplyFirstShake");
-        apos.z                  = velocity.z * cSim.fix1;
+    }
-#endif
+}
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
+__global__ void kApplySecondShake_kernel()
-        pos                    += blockDim.x * gridDim.x;
+{
-    }
+    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
-}
+    Atom* psA = &sA[threadIdx.x];
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.ShakeConstraints)
+    {
-void kUpdatePart1(gpuContext gpu)
+        int4 atomID         = cSim.pShakeID[pos];
-{
+        float4 params       = cSim.pShakeParameter[pos];
-//    printf("kUpdatePart1\n");
+        float4 apos         = cSim.pOldPosq[atomID.x];
-#if 0
+        float4 xpi          = cSim.pPosq[atomID.x];
-    static int iteration = 0;
+        float4 apos1        = cSim.pOldPosq[atomID.y];
-    if (iteration == 0)
+        float4 xpj1         = cSim.pPosq[atomID.y];
-    {
+        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
-        gpu->psPosq4->Download();
+        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
-        gpu->psVelm4->Download();
+        psA->InvMassI       = params.x;
-        printf("# %d atoms\n", gpu->natoms);
+        psA->M              = params.y;
-        for (int i = 0; i < gpu->natoms; i++)
+        psA->d2             = params.z;
-        {
+        float invMassJ      = params.w;
-            printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
+        if (atomID.z != -1)
-                gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y,
+        {
-                gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w,
+            apos2           = cSim.pOldPosq[atomID.z];
-                gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y,
+            xpj2            = cSim.pPosq[atomID.z];
-                gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w
+        }
-            );       
+        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
-        }
+        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
-    }
+        if (atomID.w != -1)
-    iteration++;
+        {
-#endif
+            apos3           = cSim.pOldPosq[atomID.w];
-#if 0
+            xpj3            = cSim.pPosq[atomID.w];
-    static const float KILO 		        =    1e3;              		// Thousand
+        }
-    static const float BOLTZMANN	        =    1.380658e-23f;            // (J/K)	
-    static const float AVOGADRO	            =    6.0221367e23f;		    // ()		
+        float3 xi, xj1, xj2, xj3;
-    static const float RGAS                 =    BOLTZMANN * AVOGADRO;     // (J/(mol K))
+        xi.x                = apos.x;
-    static const float BOLTZ                =    (RGAS / KILO);            // (kJ/(mol K)) 
+        xi.y                = apos.y;
-    static int iteration = 0;
+        xi.z                = apos.z;
+        xj1.x               = apos1.x;
-    // Check T
+        xj1.y               = apos1.y;
-    if (iteration % 1000 == 0)
+        xj1.z               = apos1.z;
-    {
+        xj2.x               = apos2.x;
-        gpu->psVelm4->Download();
+        xj2.y               = apos2.y;
-        float ke = 0.0f;
+        xj2.z               = apos2.z;
-        for (int i = 0; i < gpu->natoms; i++)
+        xj3.x               = apos3.x;
-        {
+        xj3.y               = apos3.y;
-            float vx = gpu->psVelm4->_pSysStream[0][i].x;
+        xj3.z               = apos3.z;
-            float vy = gpu->psVelm4->_pSysStream[0][i].y;
+        psA->rij1.x         = xi.x - xj1.x;
-            float vz = gpu->psVelm4->_pSysStream[0][i].z;
+        psA->rij1.y         = xi.y - xj1.y;
-            float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
+        psA->rij1.z         = xi.z - xj1.z;
-            ke += m * (vx * vx + vy * vy + vz * vz);
+        psA->rij2.x         = xi.x - xj2.x;
-        }
+        psA->rij2.y         = xi.y - xj2.y;
-        float T = ke / (BOLTZ  * gpu->sim.degreesOfFreedom);
+        psA->rij2.z         = xi.z - xj2.z;
-        printf("Iteration %d, Temperature is %f\n", iteration, T);
+        psA->rij3.x         = xi.x - xj3.x;
-    }
+        psA->rij3.y         = xi.y - xj3.y;
-    iteration++;
+        psA->rij3.z         = xi.z - xj3.z;
-#endif    
+        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
-    if (gpu->bRemoveCM)
+        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
-    {
+        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-        kUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        float ld1           = psA->d2 - psA->rij1sq;
-        LAUNCHERROR("kUpdatePart1CM");
+        float ld2           = psA->d2 - psA->rij2sq;
-        gpu->bRemoveCM = false;
+        float ld3           = psA->d2 - psA->rij3sq;
-#if 0
-        gpu->psLinearMomentum->Download();
+        bool converged = false;
-        gpu->psVelm4->Download();
+        int iteration = 0;
-        float3 mv = {0.0f, 0.0f, 0.0f};
+        while (iteration < 15 && !converged)
-        for (int i = 0; i < gpu->natoms; i++)
+        {
-        {
+            converged = true;
-            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
+            float3 rpij;
-            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
+            rpij.x          = xpi.x - xpj1.x;
-            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
+            rpij.y          = xpi.y - xpj1.y;
-            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
+            rpij.z          = xpi.z - xpj1.z;
-        }
+		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-        mv.x *= gpu->sim.inverseTotalMass;
+		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
-        mv.y *= gpu->sim.inverseTotalMass;
+		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-        mv.z *= gpu->sim.inverseTotalMass;
+            if (diff >= 1.0f)
+            {
-        float3 mv1 = {0.0f, 0.0f, 0.0f};
+                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
-        for (int i = 0; i < gpu->sim.blocks; i++)
+                float3 dr;
-        {
+                dr.x    = psA->rij1.x * acor;
-            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
+                dr.y    = psA->rij1.y * acor;
-            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
+                dr.z    = psA->rij1.z * acor;
-            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
+		        xpi.x  += dr.x * psA->InvMassI;
-        }
+		        xpi.y  += dr.y * psA->InvMassI;
-        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
+		        xpi.z  += dr.z * psA->InvMassI;
-#endif
+		        xpj1.x -= dr.x * invMassJ;
-    }
+		        xpj1.y -= dr.y * invMassJ;
-    else
+		        xpj1.z -= dr.z * invMassJ;
-    {    
+                converged = false;
-        kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+            }
-        LAUNCHERROR("kUpdatePart1");
-    }
+            if (atomID.z != -1)
-}
+            {
+                rpij.x          = xpi.x - xpj2.x;
-__global__ void kApplyFirstShake_kernel()
+                rpij.y          = xpi.y - xpj2.y;
-{
+                rpij.z          = xpi.z - xpj2.z;
-    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-    Atom* psA = &sA[threadIdx.x];
+		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-    while (pos < cSim.ShakeConstraints)
+                if (diff >= 1.0f)
-    {
+                {
-        int4 atomID         = cSim.pShakeID[pos];
+                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-        float4 params       = cSim.pShakeParameter[pos];
+                    float3 dr;
-        float4 apos         = cSim.pOldPosq[atomID.x];
+                    dr.x    = psA->rij2.x * acor;
-        float4 xpi          = cSim.pPosqP[atomID.x];
+                    dr.y    = psA->rij2.y * acor;
-        float4 apos1        = cSim.pOldPosq[atomID.y];
+                    dr.z    = psA->rij2.z * acor;
-        float4 xpj1         = cSim.pPosqP[atomID.y];
+		            xpi.x  += dr.x * psA->InvMassI;
-        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
+		            xpi.y  += dr.y * psA->InvMassI;
-        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
+		            xpi.z  += dr.z * psA->InvMassI;
-        psA->InvMassI       = params.x;
+		            xpj2.x -= dr.x * invMassJ;
-        psA->M              = params.y;
+		            xpj2.y -= dr.y * invMassJ;
-        psA->d2             = params.z;
+		            xpj2.z -= dr.z * invMassJ;
-        float invMassJ      = params.w;
+                    converged = false;
-        if (atomID.z != -1)
+                }
-        {
+            }
-            apos2           = cSim.pOldPosq[atomID.z]; 
-            xpj2            = cSim.pPosqP[atomID.z];
+            if (atomID.w != -1)
-        }   
+            {
-        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
+                rpij.x          = xpi.x - xpj3.x;
-        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
+                rpij.y          = xpi.y - xpj3.y;
-        if (atomID.w != -1)
+                rpij.z          = xpi.z - xpj3.z;
-        {
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-            apos3           = cSim.pOldPosq[atomID.w]; 
+		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
-            xpj3            = cSim.pPosqP[atomID.w];
+		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-        }    
+                if (diff >= 1.0f)
+                {
-        float3 xi, xj1, xj2, xj3;
+                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-        xi.x                = apos.x;
+                    float3 dr;
-        xi.y                = apos.y;
+                    dr.x    = psA->rij3.x * acor;
-        xi.z                = apos.z;
+                    dr.y    = psA->rij3.y * acor;
-        xj1.x               = apos1.x;
+                    dr.z    = psA->rij3.z * acor;
-        xj1.y               = apos1.y;
+		            xpi.x  += dr.x * psA->InvMassI;
-        xj1.z               = apos1.z;
+		            xpi.y  += dr.y * psA->InvMassI;
-        xj2.x               = apos2.x;
+		            xpi.z  += dr.z * psA->InvMassI;
-        xj2.y               = apos2.y;
+		            xpj3.x -= dr.x * invMassJ;
-        xj2.z               = apos2.z;
+		            xpj3.y -= dr.y * invMassJ;
-        xj3.x               = apos3.x;
+		            xpj3.z -= dr.z * invMassJ;
-        xj3.y               = apos3.y;
+                    converged = false;
-        xj3.z               = apos3.z;
+                }
-#ifndef DeltaShake
+            }
-        xpi.x              -= xi.x;
+            iteration++;
-        xpi.y              -= xi.y;
+        }
-        xpi.z              -= xi.z;
-        xpj1.x             -= xj1.x;
+        xpi.x += xi.x;
-        xpj1.y             -= xj1.y;
+        xpi.y += xi.y;
-        xpj1.z             -= xj1.z;
+        xpi.z += xi.z;
-        xpj2.x             -= xj2.x;
+        xpj1.x += xj1.x;
-        xpj2.y             -= xj2.y;
+        xpj1.y += xj1.y;
-        xpj2.z             -= xj2.z;
+        xpj1.z += xj1.z;
-        xpj3.x             -= xj3.x;
+        xpj2.x += xj2.x;
-        xpj3.y             -= xj3.y;
+        xpj2.y += xj2.y;
-        xpj3.z             -= xj3.z;
+        xpj2.z += xj2.z;
-#endif
+        xpj3.x += xj3.x;
-        psA->rij1.x         = xi.x - xj1.x;
+        xpj3.y += xj3.y;
-        psA->rij1.y         = xi.y - xj1.y;
+        xpj3.z += xj3.z;
-        psA->rij1.z         = xi.z - xj1.z;
-        psA->rij2.x         = xi.x - xj2.x;
+        cSim.pPosq[atomID.x] = xpi;
-        psA->rij2.y         = xi.y - xj2.y;
+        cSim.pPosq[atomID.y] = xpj1;
-        psA->rij2.z         = xi.z - xj2.z;
-        psA->rij3.x         = xi.x - xj3.x;
+        if (atomID.z != -1)
-        psA->rij3.y         = xi.y - xj3.y;
+            cSim.pPosq[atomID.z] = xpj2;
-        psA->rij3.z         = xi.z - xj3.z;
-        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
+        if (atomID.w != -1)
-        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
+            cSim.pPosq[atomID.w] = xpj3;
-        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-        float ld1           = psA->d2 - psA->rij1sq;
+        pos += blockDim.x * gridDim.x;
-        float ld2           = psA->d2 - psA->rij2sq;
+    }
-        float ld3           = psA->d2 - psA->rij3sq;
+}
+__global__ void kApplyNoShake_kernel()
-        bool converged = false;
+{
-        int iteration = 0;
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-        while (iteration < 15 && !converged)
+    while (pos < cSim.NonShakeConstraints)
-        {
+    {
-            converged = true;
+        int  atomID          = cSim.pNonShakeID[pos];
-            float3 rpij;
+        float4 apos          = cSim.pOldPosq[atomID];
-            rpij.x          = xpi.x - xpj1.x;
+        float4 xpi           = cSim.pPosq[atomID];
-            rpij.y          = xpi.y - xpj1.y;
+        xpi.x               += apos.x;
-            rpij.z          = xpi.z - xpj1.z;
+        xpi.y               += apos.y;
-		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+        xpi.z               += apos.z;
-		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; 
+        cSim.pPosq[atomID]   = xpi;
-		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-            if (diff >= 1.0f)
+        pos += blockDim.x * gridDim.x;
-            {
+    }
-                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
+}
-                float3 dr;
-                dr.x    = psA->rij1.x * acor;
-                dr.y    = psA->rij1.y * acor;
+void kApplySecondShake(gpuContext gpu)
-                dr.z    = psA->rij1.z * acor;
+{
-		        xpi.x  += dr.x * psA->InvMassI;
+  //  printf("kApplySecondShake\n");
-		        xpi.y  += dr.y * psA->InvMassI;
+    if (gpu->sim.ShakeConstraints > 0)
-		        xpi.z  += dr.z * psA->InvMassI;
+    {
-		        xpj1.x -= dr.x * invMassJ;
+        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
-		        xpj1.y -= dr.y * invMassJ;
+        LAUNCHERROR("kApplySecondShake");
-		        xpj1.z -= dr.z * invMassJ;
+    }
-                converged = false;
-            }
+    // handle non-Shake atoms
-            if (atomID.z != -1)
+    if (gpu->sim.NonShakeConstraints > 0)
-            {
+    {
-                rpij.x          = xpi.x - xpj2.x;
+        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
-                rpij.y          = xpi.y - xpj2.y;
+        LAUNCHERROR("kApplyNoShake");
-                rpij.z          = xpi.z - xpj2.z;
+    }
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+}
-		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; 
-		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-                    float3 dr;
-                    dr.x    = psA->rij2.x * acor;
-                    dr.y    = psA->rij2.y * acor;
-                    dr.z    = psA->rij2.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj2.x -= dr.x * invMassJ;
-		            xpj2.y -= dr.y * invMassJ;
-		            xpj2.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            if (atomID.w != -1)
-            {
-                rpij.x          = xpi.x - xpj3.x;
-                rpij.y          = xpi.y - xpj3.y;
-                rpij.z          = xpi.z - xpj3.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; 
-		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-                    float3 dr;
-                    dr.x    = psA->rij3.x * acor;
-                    dr.y    = psA->rij3.y * acor;
-                    dr.z    = psA->rij3.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj3.x -= dr.x * invMassJ;
-		            xpj3.y -= dr.y * invMassJ;
-		            xpj3.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            iteration++;
-        }
-#ifndef DeltaShake
-        xpi.x  += xi.x;
-        xpi.y  += xi.y;
-        xpi.z  += xi.z;
-        xpj1.x += xj1.x;
-        xpj1.y += xj1.y;
-        xpj1.z += xj1.z;
-        xpj2.x += xj2.x;
-        xpj2.y += xj2.y;
-        xpj2.z += xj2.z;
-        xpj3.x += xj3.x;
-        xpj3.y += xj3.y;
-        xpj3.z += xj3.z;
-#endif
-        cSim.pPosqP[atomID.x] = xpi;
-        cSim.pPosqP[atomID.y] = xpj1;
-        if (atomID.z != -1)
-            cSim.pPosqP[atomID.z] = xpj2;
-        if (atomID.w != -1)
-            cSim.pPosqP[atomID.w] = xpj3;     
-        pos += blockDim.x * gridDim.x;
-    }
-}
-void kApplyFirstShake(gpuContext gpu)
-{
-//    printf("kApplyFirstShake\n");       
-    if (gpu->sim.ShakeConstraints > 0)
-    {    
-        kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
-        LAUNCHERROR("kApplyFirstShake");
-    }
-}
-__global__ void kUpdatePart2_kernel()
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    __syncthreads();
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-#ifndef DeltaShake
-        float4 apos             = cSim.pPosq[pos];
-#endif
-        float4 xPrime           = cSim.pPosqP[pos];
-        float4 vVector          = cSim.pvVector4[pos];
-        float4 xVector;
-        float4 random4b         = cSim.pRandom4b[rpos + pos];
-        float2 random2b         = cSim.pRandom2b[rpos + pos];
-        float3 Xmh;
-        float sqrtInvMass       = sqrt(velocity.w);
-#ifdef DeltaShake
-        velocity.x              = xPrime.x * cSim.oneOverFix1;
-        velocity.y              = xPrime.y * cSim.oneOverFix1;
-        velocity.z              = xPrime.z * cSim.oneOverFix1;
-#else
-        velocity.x              = (xPrime.x - apos.x) * cSim.oneOverFix1;
-        velocity.y              = (xPrime.y - apos.y) * cSim.oneOverFix1;
-        velocity.z              = (xPrime.z - apos.z) * cSim.oneOverFix1;
-#endif
-        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.x;
-        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.y;
-        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.z;
-        xVector.x               = sqrtInvMass * random4b.w;
-        xVector.y               = sqrtInvMass * random2b.x;
-        xVector.z               = sqrtInvMass * random2b.y;                    
-        xPrime.x               += xVector.x - Xmh.x;
-        xPrime.y               += xVector.y - Xmh.y;
-        xPrime.z               += xVector.z - Xmh.z;
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        cSim.pxVector4[pos]     = xVector;
-        pos                    += blockDim.x * gridDim.x;    
-    }
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-}
-__global__ void kUpdatePart2CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    __syncthreads();
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-#ifndef DeltaShake
-        float4 apos             = cSim.pPosq[pos];
-#endif
-        float4 xPrime           = cSim.pPosqP[pos];
-        float4 vVector          = cSim.pvVector4[pos];
-        float4 xVector;
-        float4 random4b         = cSim.pRandom4b[rpos + pos];
-        float2 random2b         = cSim.pRandom2b[rpos + pos];
-        float3 Xmh;
-        float mass              = 1.0f / velocity.w;
-        float sqrtInvMass       = sqrt(velocity.w);
-#ifdef DeltaShake
-        velocity.x              = xPrime.x * cSim.oneOverFix1;
-        velocity.y              = xPrime.y * cSim.oneOverFix1;
-        velocity.z              = xPrime.z * cSim.oneOverFix1;
-#else
-        velocity.x              = (xPrime.x - apos.x) * cSim.oneOverFix1;
-        velocity.y              = (xPrime.y - apos.y) * cSim.oneOverFix1;
-        velocity.z              = (xPrime.z - apos.z) * cSim.oneOverFix1;
-#endif
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.x;
-        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.y;
-        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
-                                  sqrtInvMass * random4b.z;
-        xVector.x               = sqrtInvMass * random4b.w;
-        xVector.y               = sqrtInvMass * random2b.x;
-        xVector.z               = sqrtInvMass * random2b.y;                    
-        xPrime.x               += xVector.x - Xmh.x;
-        xPrime.y               += xVector.y - Xmh.y;
-        xPrime.z               += xVector.z - Xmh.z;
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        cSim.pxVector4[pos]     = xVector;
-        pos                    += blockDim.x * gridDim.x;    
-    }
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }  
-}
-extern void kGenerateRandoms(gpuContext gpu);
-void kUpdatePart2(gpuContext gpu)
-{
-//    printf("kUpdatePart2\n");
-    if (gpu->bCalculateCM)
-    {
-        kUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kUpdatePart2CM");
-        gpu->bCalculateCM = false;
-        gpu->bRemoveCM = true;
-#if 0
-        gpu->psLinearMomentum->Download();
-        gpu->psVelm4->Download();
-        float3 mv = {0.0f, 0.0f, 0.0f};
-        for (int i = 0; i < gpu->natoms; i++)
-        {
-            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
-            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
-            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
-            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
-        }
-        mv.x *= gpu->sim.inverseTotalMass;
-        mv.y *= gpu->sim.inverseTotalMass;
-        mv.z *= gpu->sim.inverseTotalMass;
-        float3 mv1 = {0.0f, 0.0f, 0.0f};
-        for (int i = 0; i < gpu->sim.blocks; i++)
-        {
-            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
-            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
-            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
-        }
-        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
-#endif
-    }
-    else
-    {
-        kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kUpdatePart2");
-    }
-    // Update randoms if necessary
-    gpu->iterations++;
-    if (gpu->iterations == gpu->sim.randomIterations)
-    {
-        kGenerateRandoms(gpu);
-        gpu->iterations = 0;
-    }
-}
-__global__ void kApplySecondShake_kernel()
-{
-    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
-    Atom* psA = &sA[threadIdx.x];
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.ShakeConstraints)
-    {
-        int4 atomID         = cSim.pShakeID[pos];
-        float4 params       = cSim.pShakeParameter[pos];
-        float4 apos         = cSim.pOldPosq[atomID.x];
-        float4 xpi          = cSim.pPosq[atomID.x];
-        float4 apos1        = cSim.pOldPosq[atomID.y];
-        float4 xpj1         = cSim.pPosq[atomID.y];
-        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
-        psA->InvMassI       = params.x;
-        psA->M              = params.y;
-        psA->d2             = params.z;
-        float invMassJ      = params.w;
-        if (atomID.z != -1)
-        {
-            apos2           = cSim.pOldPosq[atomID.z]; 
-            xpj2            = cSim.pPosq[atomID.z];
-        }   
-        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
-        if (atomID.w != -1)
-        {
-            apos3           = cSim.pOldPosq[atomID.w]; 
-            xpj3            = cSim.pPosq[atomID.w];
-        }    
-        float3 xi, xj1, xj2, xj3;
-        xi.x                = apos.x;
-        xi.y                = apos.y;
-        xi.z                = apos.z;
-        xj1.x               = apos1.x;
-        xj1.y               = apos1.y;
-        xj1.z               = apos1.z;
-        xj2.x               = apos2.x;
-        xj2.y               = apos2.y;
-        xj2.z               = apos2.z;
-        xj3.x               = apos3.x;
-        xj3.y               = apos3.y;
-        xj3.z               = apos3.z;
-#ifndef DeltaShake
-        xpi.x              -= xi.x;
-        xpi.y              -= xi.y;
-        xpi.z              -= xi.z;
-        xpj1.x             -= xj1.x;
-        xpj1.y             -= xj1.y;
-        xpj1.z             -= xj1.z;
-        xpj2.x             -= xj2.x;
-        xpj2.y             -= xj2.y;
-        xpj2.z             -= xj2.z;
-        xpj3.x             -= xj3.x;
-        xpj3.y             -= xj3.y;
-        xpj3.z             -= xj3.z;
-#endif
-        psA->rij1.x         = xi.x - xj1.x;
-        psA->rij1.y         = xi.y - xj1.y;
-        psA->rij1.z         = xi.z - xj1.z;
-        psA->rij2.x         = xi.x - xj2.x;
-        psA->rij2.y         = xi.y - xj2.y;
-        psA->rij2.z         = xi.z - xj2.z;
-        psA->rij3.x         = xi.x - xj3.x;
-        psA->rij3.y         = xi.y - xj3.y;
-        psA->rij3.z         = xi.z - xj3.z;
-        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
-        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
-        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-        float ld1           = psA->d2 - psA->rij1sq;
-        float ld2           = psA->d2 - psA->rij2sq;
-        float ld3           = psA->d2 - psA->rij3sq;
-        bool converged = false;
-        int iteration = 0;
-        while (iteration < 15 && !converged)
-        {
-            converged = true;
-            float3 rpij;
-            rpij.x          = xpi.x - xpj1.x;
-            rpij.y          = xpi.y - xpj1.y;
-            rpij.z          = xpi.z - xpj1.z;
-		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; 
-		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-            if (diff >= 1.0f)
-            {
-                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
-                float3 dr;
-                dr.x    = psA->rij1.x * acor;
-                dr.y    = psA->rij1.y * acor;
-                dr.z    = psA->rij1.z * acor;
-		        xpi.x  += dr.x * psA->InvMassI;
-		        xpi.y  += dr.y * psA->InvMassI;
-		        xpi.z  += dr.z * psA->InvMassI;
-		        xpj1.x -= dr.x * invMassJ;
-		        xpj1.y -= dr.y * invMassJ;
-		        xpj1.z -= dr.z * invMassJ;
-                converged = false;
-            }
-            if (atomID.z != -1)
-            {
-                rpij.x          = xpi.x - xpj2.x;
-                rpij.y          = xpi.y - xpj2.y;
-                rpij.z          = xpi.z - xpj2.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; 
-		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-                    float3 dr;
-                    dr.x    = psA->rij2.x * acor;
-                    dr.y    = psA->rij2.y * acor;
-                    dr.z    = psA->rij2.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj2.x -= dr.x * invMassJ;
-		            xpj2.y -= dr.y * invMassJ;
-		            xpj2.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            if (atomID.w != -1)
-            {
-                rpij.x          = xpi.x - xpj3.x;
-                rpij.y          = xpi.y - xpj3.y;
-                rpij.z          = xpi.z - xpj3.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; 
-		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-                    float3 dr;
-                    dr.x    = psA->rij3.x * acor;
-                    dr.y    = psA->rij3.y * acor;
-                    dr.z    = psA->rij3.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj3.x -= dr.x * invMassJ;
-		            xpj3.y -= dr.y * invMassJ;
-		            xpj3.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            iteration++;
-        }
-        xpi.x += xi.x;
-        xpi.y += xi.y;
-        xpi.z += xi.z;
-        xpj1.x += xj1.x;
-        xpj1.y += xj1.y;
-        xpj1.z += xj1.z;
-        xpj2.x += xj2.x;
-        xpj2.y += xj2.y;
-        xpj2.z += xj2.z;
-        xpj3.x += xj3.x;
-        xpj3.y += xj3.y;
-        xpj3.z += xj3.z;
-        cSim.pPosq[atomID.x] = xpi;
-        cSim.pPosq[atomID.y] = xpj1;
-        if (atomID.z != -1)
-            cSim.pPosq[atomID.z] = xpj2;
-        if (atomID.w != -1)
-            cSim.pPosq[atomID.w] = xpj3;     
-        pos += blockDim.x * gridDim.x;
-    }
-}
-__global__ void kApplyNoShake_kernel()
-{
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.NonShakeConstraints)
-    {
-        int  atomID          = cSim.pNonShakeID[pos];
-        float4 apos          = cSim.pOldPosq[atomID];
-        float4 xpi           = cSim.pPosq[atomID];
-        xpi.x               += apos.x;
-        xpi.y               += apos.y;
-        xpi.z               += apos.z;
-        cSim.pPosq[atomID]   = xpi;
-        pos += blockDim.x * gridDim.x;
-    }
-}
-void kCPUShake2(gpuContext gpu)
-{
-}
-void kApplySecondShake(gpuContext gpu)
-{
-  //  printf("kApplySecondShake\n");
-  //  kCPUShake2(gpu);
-    if (gpu->sim.ShakeConstraints > 0)
-    {
-        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
-        LAUNCHERROR("kApplySecondShake");
-    }
-    // handle non-Shake atoms
-#ifdef DeltaShake
-    if (gpu->sim.NonShakeConstraints > 0)
-    {
-        //fprintf( gpu->log, "kApplyNoShake_kernel %d %d \n", gpu->sim.blocks, gpu->sim.nonshake_threads_per_block); fflush( gpu->log );
-        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
-        LAUNCHERROR("kApplyNoShake");
-    }
-#endif
-}
--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
@@ -35,8 +35,6 @@ using namespace std;
 #include "gputypes.h"
-#define DeltaShake
 static __constant__ cudaGmxSimulation cSim;
 void SetVerletUpdateSim(gpuContext gpu)
@@ -53,102 +51,11 @@ void GetVerletUpdateSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }
-__global__ void kVerletUpdatePart1_kernel()
+// Include versions of the kernels with and with center of mass motion removal.
-{
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
-        cSim.pOldPosq[pos]      = apos;        
-        velocity.x             += dtOverMass*force.x;
-        velocity.y             += dtOverMass*force.y;
-        velocity.z             += dtOverMass*force.z;
-#ifndef DeltaShake
-        apos.x                 += velocity.x*cSim.deltaT;
-        apos.y                 += velocity.y*cSim.deltaT;
-        apos.z                 += velocity.z*cSim.deltaT;
-#else
-        apos.x                  = velocity.x*cSim.deltaT;
-        apos.y                  = velocity.y*cSim.deltaT;
-        apos.z                  = velocity.z*cSim.deltaT;
-#endif
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-__global__ void kVerletUpdatePart1CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    // Read CM outputs from previous step
-    unsigned int cpos = threadIdx.x;
-    while (cpos < gridDim.x)
-    {
-        CM1             = cSim.pLinearMomentum[cpos];
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
-        CM.z           += CM1.z;
-        cpos           += blockDim.x;
-    }
-    sCM[threadIdx.x].x  = CM.x;
-    sCM[threadIdx.x].y  = CM.y;
-    sCM[threadIdx.x].z  = CM.z;
-    __syncthreads();
-    // Reduce CM
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }       
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = cSim.deltaT*velocity.w;
-        cSim.pOldPosq[pos]      = apos;        
-        velocity.x             += dtOverMass*force.x-sCM[0].x;
-        velocity.y             += dtOverMass*force.y-sCM[0].y;
-        velocity.z             += dtOverMass*force.z-sCM[0].z;
-#ifndef DeltaShake
+#include "kVerletUpdate.h"
-        apos.x                 += velocity.x*cSim.deltaT;
+#define REMOVE_CM
-        apos.y                 += velocity.y*cSim.deltaT;
+#include "kVerletUpdate.h"
-        apos.z                 += velocity.z*cSim.deltaT;
-#else
-        apos.x                  = velocity.x*cSim.deltaT;
-        apos.y                  = velocity.y*cSim.deltaT;
-        apos.z                  = velocity.z*cSim.deltaT;
-#endif
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;        
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
 void kVerletUpdatePart1(gpuContext gpu)
 {
@@ -166,105 +73,6 @@ void kVerletUpdatePart1(gpuContext gpu)
    }
 }
-__global__ void kVerletUpdatePart2_kernel()
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#endif
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        pos                    += blockDim.x * gridDim.x;    
-    }
-}
-__global__ void kVerletUpdatePart2CM_kernel()
-{
-    extern __shared__ float3 sCM[];
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-        float mass              = 1.0f / velocity.w;
-#ifndef DeltaShake
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x-apos.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y-apos.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z-apos.z);
-#else
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#endif
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        pos                    += blockDim.x * gridDim.x;    
-    }
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }  
-}
 void kVerletUpdatePart2(gpuContext gpu)
 {
 //    printf("kVerletUpdatePart2\n");

--- a/platforms/cuda/src/kernels/kVerletUpdate.h
+++ b/platforms/cuda/src/kernels/kVerletUpdate.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+/**
+ * This file contains the kernels for Verlet integration.  It is included
+ * several times in kVerletUpdate.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+#ifdef REMOVE_CM
+__global__ void kVerletUpdatePart1CM_kernel()
+#else
+__global__ void kVerletUpdatePart1_kernel()
+#endif
+{
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM           = { 0.0f, 0.0f, 0.0f};
+    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
+    // Read CM outputs from previous step
+    unsigned int cpos = threadIdx.x;
+    while (cpos < gridDim.x)
+    {
+        CM1             = cSim.pLinearMomentum[cpos];
+        CM.x           += CM1.x;
+        CM.y           += CM1.y;
+        CM.z           += CM1.z;
+        cpos           += blockDim.x;
+    }
+    sCM[threadIdx.x].x  = CM.x;
+    sCM[threadIdx.x].y  = CM.y;
+    sCM[threadIdx.x].z  = CM.z;
+    __syncthreads();
+    // Reduce CM
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+#endif
+    while (pos < cSim.atoms)
+    {
+        float4 apos             = cSim.pPosq[pos];
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 force            = cSim.pForce4[pos];
+        float dtOverMass        = cSim.deltaT*velocity.w;
+        cSim.pOldPosq[pos]      = apos;
+        velocity.x             += dtOverMass*force.x;
+        velocity.y             += dtOverMass*force.y;
+        velocity.z             += dtOverMass*force.z;
+#ifdef REMOVE_CM
+        velocity.x             -= sCM[0].x;
+        velocity.y             -= sCM[0].y;
+        velocity.z             -= sCM[0].z;
+#endif
+        apos.x                  = velocity.x*cSim.deltaT;
+        apos.y                  = velocity.y*cSim.deltaT;
+        apos.z                  = velocity.z*cSim.deltaT;
+        cSim.pPosqP[pos]        = apos;
+        cSim.pVelm4[pos]        = velocity;
+        pos                    += blockDim.x * gridDim.x;
+    }
+}
+#ifdef REMOVE_CM
+__global__ void kVerletUpdatePart2CM_kernel()
+#else
+__global__ void kVerletUpdatePart2_kernel()
+#endif
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+#ifdef REMOVE_CM
+    extern __shared__ float3 sCM[];
+    float3 CM                   = {0.0f, 0.0f, 0.0f};
+#endif
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 apos             = cSim.pPosq[pos];
+        float4 xPrime           = cSim.pPosqP[pos];
+        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
+        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
+        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
+        xPrime.x               += apos.x;
+        xPrime.y               += apos.y;
+        xPrime.z               += apos.z;
+#ifdef REMOVE_CM
+        float mass              = 1.0f / velocity.w;
+        CM.x                   += mass * velocity.x;
+        CM.y                   += mass * velocity.y;
+        CM.z                   += mass * velocity.z;
+#endif
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+        pos                    += blockDim.x * gridDim.x;
+    }
+#ifdef REMOVE_CM
+    // Scale CM
+    CM.x *= cSim.inverseTotalMass;
+    CM.y *= cSim.inverseTotalMass;
+    CM.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = CM;
+    __syncthreads();
+    // Reduce CM for CTA
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        float4 CM;
+        CM.x                                = sCM[0].x;
+        CM.y                                = sCM[0].y;
+        CM.z                                = sCM[0].z;
+        CM.w                                = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x]    = CM;
+    }
+#endif
+}