Converted Cuda platform to use improved Langevin integrator

8725cede · Peter Eastman · 15a8478e · 8725cede · 8725cede · 8725cede
Commit 8725cede authored Apr 22, 2010 by Peter Eastman
9 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -869,9 +869,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
        prevStepSize = stepSize;
    }
    kLangevinUpdatePart1(gpu);
-    kApplyFirstShake(gpu);
-    kApplyFirstSettle(gpu);
-    kApplyFirstCCMA(gpu);
    if (data.removeCM)
        if (data.stepCount%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;
@@ -991,9 +988,6 @@ void CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, cons
    float maxStepSize = (float)(maxTime-data.time);
    kSelectLangevinStepSize(gpu, maxStepSize);
    kLangevinUpdatePart1(gpu);
-    kApplyFirstShake(gpu);
-    kApplyFirstSettle(gpu);
-    kApplyFirstCCMA(gpu);
    if (data.removeCM)
        if (data.stepCount%data.cmMotionFrequency == 0)
            gpu->bCalculateCM = true;

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -471,8 +471,6 @@ struct cudaGmxSimulation {
    float4*         pPosqP;                         // Pointer to mid-integration atom positions
    float4*         pOldPosq;                       // Pointer to old atom positions
    float4*         pVelm4;                         // Pointer to atom velocity and inverse mass
-    float4*         pvVector4;                      // Pointer to atom v Vector
-    float4*         pxVector4;                      // Pointer to atom x Vector
    float4*         pForce4;                        // Pointer to force data
    float*          pEnergy;                        // Pointer to energy output buffer
    float*          pBornForce;                     // Pointer to Born force data
@@ -482,15 +480,12 @@ struct cudaGmxSimulation {
    float4*         pLinearMomentum;                // Pointer to linear momentum
    
    // Random numbers
-    float4*         pRandom4a;                      // Pointer to first set of 4 random numbers
-    float4*         pRandom4b;                      // Pointer to second set of 4 random numbers
-    float2*         pRandom2a;                      // Pointer to first set of 2 random numbers
-    float2*         pRandom2b;                      // Pointer to second set of 2 random numbers
+    float4*         pRandom4;                       // Pointer to 4 random numbers
+    float2*         pRandom2;                       // Pointer to 2 random numbers
    uint4*          pRandomSeed;                    // Pointer to random seeds
    int*            pRandomPosition;                // Pointer to random number positions
    unsigned int    randoms;                        // Number of randoms
    unsigned int    totalRandoms;                   // Number of randoms plus overflow.
-    unsigned int    totalRandomsTimesTwo;           // Used for generating randoms
    unsigned int    randomIterations;               // Number of iterations before regenerating randoms
    unsigned int    randomFrames;                   // Number of frames of random numbers
 };

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1651,10 +1651,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.pOldPosq                   = gpu->psOldPosq4->_pDevStream[0];
    gpu->psVelm4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Velm");
    gpu->sim.pVelm4                     = gpu->psVelm4->_pDevStream[0];
-    gpu->psvVector4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "vVector");
-    gpu->sim.pvVector4                  = gpu->psvVector4->_pDevStream[0];
-    gpu->psxVector4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "xVector");
-    gpu->sim.pxVector4                  = gpu->psxVector4->_pDevStream[0];
    gpu->psBornRadii                    = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "BornRadii");
    gpu->sim.pBornRadii                 = gpu->psBornRadii->_pDevStream[0];
    gpu->psObcChain                     = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "ObcChain");
@@ -1669,7 +1665,7 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.pStepSize                  = gpu->psStepSize->_pDevStream[0];
    (*gpu->psStepSize)[0] = make_float2(0.0f, 0.0f);
    gpu->psStepSize->Upload();
-    gpu->psLangevinParameters           = new CUDAStream<float>(11, 1, "LangevinParameters");
+    gpu->psLangevinParameters           = new CUDAStream<float>(3, 1, "LangevinParameters");
    gpu->sim.pLangevinParameters        = gpu->psLangevinParameters->_pDevStream[0];
    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1, "AtomIndex");
@@ -1684,15 +1680,12 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.randomIterations           = gpu->sim.randomFrames;
    gpu->sim.randoms                    = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms;
    gpu->sim.totalRandoms               = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
-    gpu->sim.totalRandomsTimesTwo       = gpu->sim.totalRandoms * 2;
-    gpu->psRandom4                      = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1, "Random4");
-    gpu->psRandom2                      = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1, "Random2");
+    gpu->psRandom4                      = new CUDAStream<float4>(gpu->sim.totalRandoms, 1, "Random4");
+    gpu->psRandom2                      = new CUDAStream<float2>(gpu->sim.totalRandoms, 1, "Random2");
    gpu->psRandomPosition               = new CUDAStream<int>(gpu->sim.blocks, 1, "RandomPosition");
    gpu->psRandomSeed                   = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1, "RandomSeed");
-    gpu->sim.pRandom4a                  = gpu->psRandom4->_pDevStream[0];
-    gpu->sim.pRandom2a                  = gpu->psRandom2->_pDevStream[0];
-    gpu->sim.pRandom4b                  = gpu->psRandom4->_pDevStream[0] + gpu->sim.totalRandoms;
-    gpu->sim.pRandom2b                  = gpu->psRandom2->_pDevStream[0] + gpu->sim.totalRandoms;
+    gpu->sim.pRandom4                   = gpu->psRandom4->_pDevStream[0];
+    gpu->sim.pRandom2                   = gpu->psRandom2->_pDevStream[0];
    gpu->sim.pRandomPosition            = gpu->psRandomPosition->_pDevStream[0];
    gpu->sim.pRandomSeed                = gpu->psRandomSeed->_pDevStream[0];

@@ -1884,14 +1877,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)

    gpu->natoms = numAtoms;
    gpuAllocateInitialBuffers(gpu);
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        (*gpu->psxVector4)[i].x = 0.0f;
-        (*gpu->psxVector4)[i].y = 0.0f;
-        (*gpu->psxVector4)[i].z = 0.0f;
-        (*gpu->psxVector4)[i].w = 0.0f;
-    }
-    gpu->psxVector4->Upload();

    gpu->iterations = 0;
    gpu->sim.update_threads_per_block               = (gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
@@ -2023,60 +2008,12 @@ void gpuSetLangevinIntegrationParameters(gpuContext gpu, float tau, float deltaT
    gpu->sim.tau                    = tau;
    gpu->sim.T                      = temperature;
    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
-    double GDT                       = gpu->sim.deltaT / gpu->sim.tau;
-    double EPH                       = exp(0.5 * GDT);
-    double EMH                       = exp(-0.5 * GDT);
-    double EP                        = exp(GDT);
-    double EM                        = exp(-GDT);
-    double B, C, D;
-    if (GDT >= 0.1)
-    {
-        double term1 = EPH - 1.0;
-        term1                      *= term1;
-        B                           = GDT * (EP - 1.0) - 4.0 * term1;
-        C                           = GDT - 3.0 + 4.0 * EMH - EM;
-        D                           = 2.0 - EPH - EMH;
-    }
-    else
-    {
-        double term1                 = 0.5 * GDT;
-        double term2                 = term1 * term1;
-        double term4                 = term2 * term2;
-
-        double third                 = 1.0 / 3.0;
-        double o7_9                  = 7.0 / 9.0;
-        double o1_12                 = 1.0 / 12.0;
-        double o17_90                = 17.0 / 90.0;
-        double o7_30                 = 7.0 / 30.0;
-        double o31_1260              = 31.0 / 1260.0;
-        double o_360                 = 1.0 / 360.0;
-
-        B                           = term4 * (third + term1 * (third + term1 * (o17_90 + term1 * o7_9)));
-        C                           = term2 * term1 * (2.0 * third + term1 * (-0.5 + term1 * (o7_30 + term1 * (-o1_12 + term1 * o31_1260))));
-        D                           = term2 * (-1.0 + term2 * (-o1_12 - term2 * o_360));
-    }
-    double DOverTauC                 = D / (gpu->sim.tau * C);
-    double TauOneMinusEM             = gpu->sim.tau * (1.0-EM);
-    double TauDOverEMMinusOne        = gpu->sim.tau * D / (EM - 1.0);
-    double fix1                      = gpu->sim.tau * (EPH - EMH);
-    if (fix1 == 0.0)
-        fix1 = deltaT;
-    double oneOverFix1               = 1.0 / fix1;
-    double V                         = sqrt(gpu->sim.kT * (1.0 - EM));
-    double X                         = gpu->sim.tau * sqrt(gpu->sim.kT * C);
-    double Yv                        = sqrt(gpu->sim.kT * B / C);
-    double Yx                        = gpu->sim.tau * sqrt(gpu->sim.kT * B / (1.0 - EM));
-    (*gpu->psLangevinParameters)[0] = (float) EM;
-    (*gpu->psLangevinParameters)[1] = (float) EM;
-    (*gpu->psLangevinParameters)[2] = (float) DOverTauC;
-    (*gpu->psLangevinParameters)[3] = (float) TauOneMinusEM;
-    (*gpu->psLangevinParameters)[4] = (float) TauDOverEMMinusOne;
-    (*gpu->psLangevinParameters)[5] = (float) V;
-    (*gpu->psLangevinParameters)[6] = (float) X;
-    (*gpu->psLangevinParameters)[7] = (float) Yv;
-    (*gpu->psLangevinParameters)[8] = (float) Yx;
-    (*gpu->psLangevinParameters)[9] = (float) fix1;
-    (*gpu->psLangevinParameters)[10] = (float) oneOverFix1;
+    double vscale = exp(-deltaT/tau);
+    double fscale = (1-vscale)*tau;
+    double noisescale = sqrt(2*gpu->sim.kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
+    (*gpu->psLangevinParameters)[0] = (float) vscale;
+    (*gpu->psLangevinParameters)[1] = (float) fscale;
+    (*gpu->psLangevinParameters)[2] = (float) noisescale;
    gpu->psLangevinParameters->Upload();
    gpu->psStepSize->Download();
    (*gpu->psStepSize)[0].y = deltaT;
@@ -2129,8 +2066,6 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psVelm4;
    delete gpu->psForce4;
    delete gpu->psEnergy;
-    delete gpu->psxVector4;
-    delete gpu->psvVector4;
    delete gpu->psSigEps2;
    if (gpu->psCustomParams != NULL)
        delete gpu->psCustomParams;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -101,8 +101,6 @@ struct _gpuContext {
    CUDAStream<float4>* psVelm4;
    CUDAStream<float4>* psForce4;
    CUDAStream<float>*  psEnergy;           // Energy output buffer
-    CUDAStream<float4>* psxVector4;
-    CUDAStream<float4>* psvVector4;
    CUDAStream<float2>* psSigEps2; 
    CUDAStream<float4>* psCustomParams;     // Atom parameters for custom nonbonded force
    CUDAStream<int4>* psCustomBondID;             // Atom indices for custom bonds

--- a/platforms/cuda/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda/src/kernels/kBrownianUpdate.cu
@@ -51,14 +51,14 @@ void GetBrownianUpdateSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
+#endif
 void kBrownianUpdatePart1_kernel()
 {
    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
@@ -67,7 +67,7 @@ void kBrownianUpdatePart1_kernel()
    
    while (pos < cSim.atoms)
    {
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float4 random4a         = cSim.pRandom4[rpos + pos];
        float4 apos             = cSim.pPosq[pos];
        float4 force            = cSim.pForce4[pos];
        float invMass           = cSim.pVelm4[pos].w;
@@ -90,15 +90,15 @@ void kBrownianUpdatePart1(gpuContext gpu)
    LAUNCHERROR("kBrownianUpdatePart1");
 }

-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kBrownianUpdatePart2_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
+#endif
+void kBrownianUpdatePart2_kernel()
 {
    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];

--- a/platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+++ b/platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
@@ -62,7 +62,7 @@ __global__ void kCalculateAndersenThermostat_kernel()
    while (pos < cSim.atoms)
    {
        float4 velocity         = cSim.pVelm4[pos];
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float4 random4a         = cSim.pRandom4[rpos + pos];
        float scale = (random4a.w > -randomRange && random4a.w < randomRange ? 0.0f : 1.0f);
        float add = (1.0f-scale)*sqrt(cSim.kT*velocity.w);
        velocity.x = scale*velocity.x + add*random4a.x;

--- a/platforms/cuda/src/kernels/kLangevinUpdate.cu
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.cu
@@ -34,7 +34,7 @@ using namespace std;

 #include "gputypes.h"

-enum {EM, EM_V, DOverTauC, TauOneMinusEM_V, TauDOverEMMinusOne, V, X, Yv, Yx, Fix1, OneOverFix1, MaxParams};
+enum {VelScale, ForceScale, NoiseScale, MaxParams};

 static __constant__ cudaGmxSimulation cSim;

@@ -101,15 +101,15 @@ void kLangevinUpdatePart2(gpuContext gpu)
 }


-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kSelectLangevinStepSize_kernel(float maxStepSize)
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
+#endif
+void kSelectLangevinStepSize_kernel(float maxStepSize)
 {
    // Calculate the error.

@@ -151,53 +151,12 @@ void kSelectLangevinStepSize_kernel(float maxStepSize)

        // Recalculate the integration parameters.

-        float gdt                  = newStepSize / cSim.tau;
-        float eph                  = exp(0.5f * gdt);
-        float emh                  = exp(-0.5f * gdt);
-        float ep                   = exp(gdt);
-        float em                   = exp(-gdt);
-        float em_v                 = exp(-0.5f*(oldStepSize+newStepSize)/cSim.tau);
-        float b, c, d;
-        if (gdt >= 0.1f)
-        {
-            float term1 = eph - 1.0f;
-            term1                 *= term1;
-            b                      = gdt * (ep - 1.0f) - 4.0f * term1;
-            c                      = gdt - 3.0f + 4.0f * emh - em;
-            d                      = 2.0f - eph - emh;
-        }
-        else
-        {
-            float term1            = 0.5f * gdt;
-            float term2            = term1 * term1;
-            float term4            = term2 * term2;
-
-            float third            = 1.0f / 3.0f;
-            float o7_9             = 7.0f / 9.0f;
-            float o1_12            = 1.0f / 12.0f;
-            float o17_90           = 17.0f / 90.0f;
-            float o7_30            = 7.0f / 30.0f;
-            float o31_1260         = 31.0f / 1260.0f;
-            float o_360            = 1.0f / 360.0f;
-
-            b                      = term4 * (third + term1 * (third + term1 * (o17_90 + term1 * o7_9)));
-            c                      = term2 * term1 * (2.0f * third + term1 * (-0.5f + term1 * (o7_30 + term1 * (-o1_12 + term1 * o31_1260))));
-            d                      = term2 * (-1.0f + term2 * (-o1_12 - term2 * o_360));
-        }
-        float fix1                 = cSim.tau * (eph - emh);
-        if (fix1 == 0.0f)
-            fix1 = newStepSize;
-        params[EM]                 = em;
-        params[EM_V]               = em_v;
-        params[DOverTauC]          = d / (cSim.tau * c);
-        params[TauOneMinusEM_V]    = cSim.tau * (1.0f-em_v);
-        params[TauDOverEMMinusOne] = cSim.tau * d / (em - 1.0f);
-        params[Fix1]               = fix1;
-        params[OneOverFix1]        = 1.0f / fix1;
-        params[V]                  = sqrt(cSim.kT * (1.0f - em));
-        params[X]                  = cSim.tau * sqrt(cSim.kT * c);
-        params[Yv]                 = sqrt(cSim.kT * b / c);
-        params[Yx]                 = cSim.tau * sqrt(cSim.kT * b / (1.0f - em));
+        float vscale = exp(-newStepSize/cSim.tau);
+        float fscale = (1-vscale)*cSim.tau;
+        float noisescale = sqrt(2*cSim.kT/cSim.tau)*sqrt(0.5f*(1-vscale*vscale)*cSim.tau);
+        params[VelScale] = vscale;
+        params[ForceScale] = fscale;
+        params[NoiseScale] = noisescale;
    }
    __syncthreads();
    if (threadIdx.x < MaxParams)

--- a/platforms/cuda/src/kernels/kLangevinUpdate.h
+++ b/platforms/cuda/src/kernels/kLangevinUpdate.h
@@ -24,6 +24,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

+#include "cudatypes.h"

 /**
 * This file contains the kernels for Langevin integration.  It is included
@@ -45,12 +46,12 @@ void kLangevinUpdatePart1CM_kernel()
 void kLangevinUpdatePart1_kernel()
 #endif
 {
-    __shared__ volatile float params[MaxParams];
-    if (threadIdx.x < MaxParams)
-        params[threadIdx.x] = cSim.pLangevinParameters[threadIdx.x];
-    __syncthreads();
    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
+    float vscale = cSim.pLangevinParameters[VelScale];
+    float fscale = cSim.pLangevinParameters[ForceScale];
+    float noisescale = cSim.pLangevinParameters[NoiseScale];
+
 #ifdef REMOVE_CM
    extern __shared__ float3 sCM[];
    float3 CM           = { 0.0f, 0.0f, 0.0f};
@@ -91,45 +92,19 @@ void kLangevinUpdatePart1_kernel()
    while (pos < cSim.atoms)
    {
        float4 velocity         = cSim.pVelm4[pos];
-        float4 xVector          = cSim.pxVector4[pos];
-        float4 random4a         = cSim.pRandom4a[rpos + pos];
-        float2 random2a         = cSim.pRandom2a[rpos + pos];
-        float4 apos             = cSim.pPosq[pos];
+        float4 random4a         = cSim.pRandom4[rpos + pos];
        float4 force            = cSim.pForce4[pos];

-        float3 Vmh;
        float sqrtInvMass       = sqrt(velocity.w);
-        Vmh.x                   = xVector.x * params[DOverTauC] + sqrtInvMass * params[Yv] * random4a.x;
-        Vmh.y                   = xVector.y * params[DOverTauC] + sqrtInvMass * params[Yv] * random4a.y;
-        Vmh.z                   = xVector.z * params[DOverTauC] + sqrtInvMass * params[Yv] * random4a.z;
-        float4 vVector;
-        vVector.x               = sqrtInvMass * params[V] * random4a.w;
-        vVector.y               = sqrtInvMass * params[V] * random2a.x;
-        vVector.z               = sqrtInvMass * params[V] * random2a.y;
-        vVector.w               = 0.0f;
-        cSim.pvVector4[pos]     = vVector;
-        velocity.x              = velocity.x * params[EM_V] +
-                                  velocity.w * force.x * params[TauOneMinusEM_V] +
-                                  vVector.x -
-                                  params[EM] * Vmh.x;
-        velocity.y              = velocity.y * params[EM_V] +
-                                  velocity.w * force.y * params[TauOneMinusEM_V] +
-                                  vVector.y -
-                                  params[EM] * Vmh.y;
-        velocity.z              = velocity.z * params[EM_V] +
-                                  velocity.w * force.z * params[TauOneMinusEM_V] +
-                                  vVector.z -
-                                  params[EM] * Vmh.z;
+        velocity.x = vscale*velocity.x + fscale*velocity.w*force.x + noisescale*sqrtInvMass*random4a.x;
+        velocity.y = vscale*velocity.y + fscale*velocity.w*force.y + noisescale*sqrtInvMass*random4a.y;
+        velocity.z = vscale*velocity.z + fscale*velocity.w*force.z + noisescale*sqrtInvMass*random4a.z;
 #ifdef REMOVE_CM
        velocity.x             -= sCM[0].x;
        velocity.y             -= sCM[0].y;
        velocity.z             -= sCM[0].z;
 #endif
-        cSim.pOldPosq[pos]      = apos;
-        apos.x                  = velocity.x * params[Fix1];
-        apos.y                  = velocity.y * params[Fix1];
-        apos.z                  = velocity.z * params[Fix1];
-        cSim.pPosqP[pos]        = apos;
+        cSim.pOldPosq[pos]      = cSim.pPosq[pos];
        cSim.pVelm4[pos]        = velocity;
        pos                    += blockDim.x * gridDim.x;
    }
@@ -149,12 +124,9 @@ void kLangevinUpdatePart2CM_kernel()
 void kLangevinUpdatePart2_kernel()
 #endif
 {
-    __shared__ float params[MaxParams];
-    if (threadIdx.x < MaxParams)
-        params[threadIdx.x] = cSim.pLangevinParameters[threadIdx.x];
-    __syncthreads();
    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+    float dt = cSim.pStepSize[0].y;
 #ifdef REMOVE_CM
    extern __shared__ float3 sCM[];
    float3 CM                   = {0.0f, 0.0f, 0.0f};
@@ -164,16 +136,6 @@ void kLangevinUpdatePart2_kernel()
    while (pos < cSim.atoms)
    {
        float4 velocity         = cSim.pVelm4[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-        float4 vVector          = cSim.pvVector4[pos];
-        float4 xVector;
-        float4 random4b         = cSim.pRandom4b[rpos + pos];
-        float2 random2b         = cSim.pRandom2b[rpos + pos];
-        float3 Xmh;
-        float sqrtInvMass       = sqrt(velocity.w);
-        velocity.x              = xPrime.x * params[OneOverFix1];
-        velocity.y              = xPrime.y * params[OneOverFix1];
-        velocity.z              = xPrime.z * params[OneOverFix1];
 #ifdef REMOVE_CM
        float mass              = 1.0f / velocity.w;
        CM.x                   += mass * velocity.x;
@@ -181,24 +143,9 @@ void kLangevinUpdatePart2_kernel()
        CM.z                   += mass * velocity.z;
 #endif

-        Xmh.x                   = vVector.x * params[TauDOverEMMinusOne] +
-                                  sqrtInvMass * params[Yx] * random4b.x;
-        Xmh.y                   = vVector.y * params[TauDOverEMMinusOne] +
-                                  sqrtInvMass * params[Yx] * random4b.y;
-        Xmh.z                   = vVector.z * params[TauDOverEMMinusOne] +
-                                  sqrtInvMass * params[Yx] * random4b.z;
-        xVector.x               = sqrtInvMass * params[X] * random4b.w;
-        xVector.y               = sqrtInvMass * params[X] * random2b.x;
-        xVector.z               = sqrtInvMass * params[X] * random2b.y;
-        xPrime.x               += xVector.x - Xmh.x;
-        xPrime.y               += xVector.y - Xmh.y;
-        xPrime.z               += xVector.z - Xmh.z;
-
+        float4 xPrime = make_float4(dt*velocity.x, dt*velocity.y, dt*velocity.z, 0);

        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        cSim.pxVector4[pos]     = xVector;
-
        pos                    += blockDim.x * gridDim.x;
    }


--- a/platforms/cuda/src/kernels/kRandom.cu
+++ b/platforms/cuda/src/kernels/kRandom.cu
@@ -54,14 +54,14 @@ void GetRandomSim(gpuContext gpu)
 extern __shared__ float3 sRand[];


-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_RANDOM_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 130)
-__launch_bounds__(GT2XX_RANDOM_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_RANDOM_THREADS_PER_BLOCK, 1)
-#endif
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_RANDOM_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 130)
+__launch_bounds__(GT2XX_RANDOM_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_RANDOM_THREADS_PER_BLOCK, 1)
+#endif
 void kGenerateRandoms_kernel()
 {
    unsigned int pos            = blockIdx.x * blockDim.x + threadIdx.x;
@@ -73,7 +73,7 @@ void kGenerateRandoms_kernel()
    
    float4 random4;
    float2 random2;
-    while (pos < cSim.totalRandomsTimesTwo)
+    while (pos < cSim.totalRandoms)
    {
        
        // Generate 6 randoms in GRF
@@ -156,10 +156,10 @@ void kGenerateRandoms_kernel()
        random4.y               = sRand[threadIdx.x].y;
        random4.z               = sRand[threadIdx.x].z;
        random4.w               = sRand[threadIdx.x + blockDim.x].x;
-        cSim.pRandom4a[pos]     = random4;
+        cSim.pRandom4[pos]     = random4;
        random2.x               = sRand[threadIdx.x + blockDim.x].y;
        random2.y               = sRand[threadIdx.x + blockDim.x].z;
-        cSim.pRandom2a[pos]     = random2;
+        cSim.pRandom2[pos]     = random2;
        
        pos += increment;
    }