Checked in Cuda code

38f6c8f8 · Peter Eastman · 95d79181 · 38f6c8f8 · 38f6c8f8 · 38f6c8f8
Commit 38f6c8f8 authored Jan 27, 2009 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#ifndef CUDAKERNELS_H
+#define CUDAKERNELS_H
+
+#include "gputypes.h"
+
+// Declarations of the entry points exported by the CUDA kernel sources.
+// Every function takes a gpuContext (presumably declared in gputypes.h) and
+// returns nothing. The include guard above makes it safe to include this
+// header more than once in a translation unit.
+
+// Initialization
+extern void kClearForces(gpuContext gpu);
+extern void kCalculateObcGbsaBornSum(gpuContext gpu);
+extern void kReduceObcGbsaBornSum(gpuContext gpu);
+extern void kGenerateRandoms(gpuContext gpu);
+
+// Main loop
+extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
+extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
+extern void kCalculateCDLJForces(gpuContext gpu);
+extern void kCalculateCDLJForces_12(gpuContext gpu);
+extern void kCalculateObcGbsaForces1(gpuContext gpu);
+extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
+extern void kReduceObcGbsaBornForces(gpuContext gpu);
+extern void kCalculateObcGbsaForces2(gpuContext gpu);
+extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
+extern void kCalculateLocalForces(gpuContext gpu);
+extern void kCalculateAndersenThermostat(gpuContext gpu);
+extern void kReduceBornSumAndForces(gpuContext gpu);
+extern void kUpdatePart1(gpuContext gpu);
+extern void kApplyFirstShake(gpuContext gpu);
+extern void kUpdatePart2(gpuContext gpu);
+extern void kApplySecondShake(gpuContext gpu);
+extern void kVerletUpdatePart1(gpuContext gpu);
+extern void kVerletUpdatePart2(gpuContext gpu);
+extern void kBrownianUpdatePart1(gpuContext gpu);
+extern void kBrownianUpdatePart2(gpuContext gpu);
+
+// Extras
+extern void kReduceForces(gpuContext gpu);
+extern void kClearBornForces(gpuContext gpu);
+
+// Initializers
+// Each kernel module exposes a Set...Sim/Get...Sim pair.
+// NOTE(review): presumably these copy the simulation constants to and from
+// the module's device-side copy -- confirm against the .cu sources.
+extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
+extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
+extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
+extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
+extern void SetCalculateCDLJForcesSim(gpuContext gpu);
+extern void GetCalculateCDLJForcesSim(gpuContext gpu);
+extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
+extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
+extern void SetCalculateLocalForcesSim(gpuContext gpu);
+extern void GetCalculateLocalForcesSim(gpuContext gpu);
+extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
+extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
+extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
+extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
+extern void SetForcesSim(gpuContext gpu);
+extern void GetForcesSim(gpuContext gpu);
+extern void SetUpdateShakeHSim(gpuContext gpu);
+extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void SetVerletUpdateSim(gpuContext gpu);
+extern void GetVerletUpdateSim(gpuContext gpu);
+extern void SetBrownianUpdateSim(gpuContext gpu);
+extern void GetBrownianUpdateSim(gpuContext gpu);
+extern void SetRandomSim(gpuContext gpu);
+extern void GetRandomSim(gpuContext gpu);
+
+#endif // CUDAKERNELS_H
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
+#ifndef CUDATYPES_H
+#define CUDATYPES_H
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdarg.h>
+#include <limits>
+#include <iostream>
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <builtin_types.h>
+#include <vector_functions.h>
+using namespace std;
+
+#define RTERROR(status, s) \
+    if (status != cudaSuccess) { \
+        printf("%s %s\n", s, cudaGetErrorString(status)); \
+        exit(-1); \
+    }
+
+#define LAUNCHERROR(s) \
+    { \
+        cudaError_t status = cudaGetLastError(); \
+        if (status != cudaSuccess) { \
+            printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), s); \
+            exit(-1); \
+        } \
+    }
+
+// Abstract interface for objects that keep data both on the GPU and on the
+// CPU. CUDAStream below is the implementation visible in this header.
+struct SoADeviceObject {
+    // Virtual so that deleting a derived object through a SoADeviceObject*
+    // runs the derived destructor instead of invoking undefined behaviour.
+    virtual ~SoADeviceObject() {}
+    virtual void Allocate() = 0;    // Acquire the object's storage
+    virtual void Deallocate() = 0;  // Release the object's storage
+    virtual void Upload() = 0;      // Copy host data to the device
+    virtual void Download() = 0;    // Copy device data to the host
+};
+
+// Host/device buffer pair holding _subStreams sub-arrays of element type T.
+// Allocate() creates one host array and one device array of
+// _subStreams * _stride elements each, plus per-sub-stream pointer tables
+// into both. Upload() and Download() copy the whole buffer between the two.
+//
+// NOTE: an instance owns raw pointers that its destructor releases, and no
+// copy constructor or assignment operator is defined, so copying a
+// CUDAStream would release the same buffers twice. Pass instances by pointer
+// or reference.
+template <typename T>
+struct CUDAStream : public SoADeviceObject
+{
+    unsigned int    _length;        // Elements per sub-stream, as requested by the caller
+    unsigned int    _subStreams;    // Number of sub-streams
+    unsigned int    _stride;        // _length rounded up to a multiple of 16; element offset between sub-streams
+    T**             _pSysStream;    // Host table: _pSysStream[i] points at sub-stream i inside _pSysData
+    T**             _pDevStream;    // Host table: _pDevStream[i] is the address of sub-stream i inside _pDevData
+    T*              _pSysData;      // Host buffer
+    T*              _pDevData;      // Device buffer obtained from cudaMalloc
+
+    // Overloads for every int/unsigned combination of the two arguments.
+    // Only the two same-signedness overloads default subStreams: when all
+    // four did, any one-argument construction matched two overloads equally
+    // well and was rejected as ambiguous.
+    CUDAStream(int length, int subStreams = 1);
+    CUDAStream(unsigned int length, unsigned int subStreams = 1);
+    CUDAStream(unsigned int length, int subStreams);
+    CUDAStream(int length, unsigned int subStreams);
+    virtual ~CUDAStream();
+    void Allocate();
+    void Deallocate();
+    void Upload();
+    void Download();
+    // Rearranges the host data into `newstreams` sub-streams; `interleave`
+    // is not used by the implementation in this header.
+    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
+};
+
+// Compares two float streams; the definition is not in this header.
+// NOTE(review): the meaning of the return value, of `tolerance` and of
+// `maxindex` (default 0) cannot be determined from here -- confirm against
+// the implementation.
+float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
+
+// Constructor for a signed length and an unsigned sub-stream count.
+// Records both values, rounds the length up to the next multiple of 16 to
+// obtain the stride, then allocates the host and device buffers.
+template <typename T>
+CUDAStream<T>::CUDAStream(int length, unsigned int subStreams)
+    : _length(length),
+      _subStreams(subStreams),
+      _stride((length + 0xf) & 0xfffffff0)
+{
+    this->Allocate();
+}
+
+// Constructor for an unsigned length and a signed sub-stream count.
+// Records both values, rounds the length up to the next multiple of 16 to
+// obtain the stride, then allocates the host and device buffers.
+template <typename T>
+CUDAStream<T>::CUDAStream(unsigned int length, int subStreams)
+    : _length(length),
+      _subStreams(subStreams),
+      _stride((length + 0xf) & 0xfffffff0)
+{
+    this->Allocate();
+}
+
+// Constructor for an unsigned length and an unsigned sub-stream count.
+// Records both values, rounds the length up to the next multiple of 16 to
+// obtain the stride, then allocates the host and device buffers.
+template <typename T>
+CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams)
+    : _length(length),
+      _subStreams(subStreams),
+      _stride((length + 0xf) & 0xfffffff0)
+{
+    this->Allocate();
+}
+
+// Constructor for a signed length and a signed sub-stream count.
+// Records both values, rounds the length up to the next multiple of 16 to
+// obtain the stride, then allocates the host and device buffers.
+template <typename T>
+CUDAStream<T>::CUDAStream(int length, int subStreams)
+    : _length(length),
+      _subStreams(subStreams),
+      _stride((length + 0xf) & 0xfffffff0)
+{
+    this->Allocate();
+}
+
+// Destructor: releases every buffer the stream owns by way of Deallocate().
+template <typename T>
+CUDAStream<T>::~CUDAStream()
+{
+    this->Deallocate();
+}
+
+// Allocates the host buffer, the device buffer and the two per-sub-stream
+// pointer tables, then points entry i of each table at offset i * _stride of
+// the corresponding buffer. Terminates the program through RTERROR if
+// cudaMalloc fails. The host buffer contents are left uninitialised.
+template <typename T>
+void CUDAStream<T>::Allocate()
+{
+    // Widen before multiplying so the element count is not truncated to
+    // 32 bits on platforms where size_t is wider than unsigned int.
+    const size_t elements = static_cast<size_t>(_stride) * _subStreams;
+
+    _pSysStream =   new T*[_subStreams];
+    _pDevStream =   new T*[_subStreams];
+    _pSysData =     new T[elements];
+    _pDevData =     NULL;   // defined value even if cudaMalloc fails
+
+    cudaError_t status = cudaMalloc((void **) &_pDevData, elements * sizeof(T));
+    RTERROR(status, "cudaMalloc CUDAStream::Allocate failed");
+
+    for (unsigned int i = 0; i < _subStreams; i++)
+    {
+        const size_t offset = static_cast<size_t>(i) * _stride;
+        _pSysStream[i] = _pSysData + offset;
+        _pDevStream[i] = _pDevData + offset;
+    }
+}
+
+// Releases the pointer tables, the host buffer and the device buffer, and
+// resets every pointer to NULL. Terminates the program through RTERROR if
+// cudaFree fails.
+template <typename T>
+void CUDAStream<T>::Deallocate()
+{
+    cudaError_t status;
+    delete[] _pSysStream;
+    _pSysStream = NULL;
+    delete[] _pDevStream;
+    _pDevStream = NULL;
+    delete[] _pSysData;
+    _pSysData = NULL;
+    status = cudaFree(_pDevData);
+    // Reset the device pointer like the host pointers above. Without this, a
+    // second Deallocate() (for example an explicit call followed by the
+    // destructor) would hand an already-freed pointer to cudaFree.
+    _pDevData = NULL;
+    RTERROR(status, "cudaFree CUDAStream::Deallocate failed");
+}
+
+// Copies the entire host buffer (_stride * _subStreams elements) to the
+// device buffer with a blocking cudaMemcpy. Terminates the program through
+// RTERROR on failure.
+template <typename T>
+void CUDAStream<T>::Upload()
+{
+    // Widen before multiplying so the byte count is not truncated to 32 bits
+    // on platforms where size_t is wider than unsigned int.
+    const size_t bytes = static_cast<size_t>(_stride) * _subStreams * sizeof(T);
+    cudaError_t status = cudaMemcpy(_pDevData, _pSysData, bytes, cudaMemcpyHostToDevice);
+    RTERROR(status, "cudaMemcpy CUDAStream::Upload failed");
+}
+
+// Copies the entire device buffer (_stride * _subStreams elements) back to
+// the host buffer with a blocking cudaMemcpy. Terminates the program through
+// RTERROR on failure.
+template <typename T>
+void CUDAStream<T>::Download()
+{
+    // Widen before multiplying so the byte count is not truncated to 32 bits
+    // on platforms where size_t is wider than unsigned int.
+    const size_t bytes = static_cast<size_t>(_stride) * _subStreams * sizeof(T);
+    cudaError_t status = cudaMemcpy(_pSysData, _pDevData, bytes, cudaMemcpyDeviceToHost);
+    RTERROR(status, "cudaMemcpy CUDAStream::Download failed");
+}
+
+// Rearranges the host-side data from _subStreams sub-streams into
+// `newstreams` sub-streams and updates _stride, _length, _subStreams and
+// both pointer tables to match.
+//
+// Elements are read in the order (element 0 of every sub-stream, element 1
+// of every sub-stream, ...) and dealt round-robin to the new sub-streams.
+// The new stride and length are computed with integer division, so any
+// remainder of (_subStreams * _length) / newstreams is dropped.
+//
+//   newstreams - number of sub-streams after the call; must be non-zero
+//   interleave - not used by this implementation
+//
+// Only the host buffer is rewritten; this function performs no device copy.
+template <typename T>
+void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
+{
+    (void) interleave;
+
+    // The stride and length below divide by newstreams.
+    if (newstreams == 0)
+    {
+        printf("Error: CUDAStream::Collapse called with newstreams == 0\n");
+        exit(-1);
+    }
+
+    T* pTemp = new T[_subStreams * _stride];
+    unsigned int stream = 0;
+    unsigned int pos = 0;
+    unsigned int newstride = _stride * _subStreams / newstreams;
+    unsigned int newlength = _length * _subStreams / newstreams;
+
+    // Copy data into new format
+    for (unsigned int i = 0; i < _length; i++)
+    {
+        for (unsigned int j = 0; j < _subStreams; j++)
+        {
+            pTemp[stream * newstride + pos] = _pSysStream[j][i];
+            stream++;
+            if (stream == newstreams)
+            {
+                stream = 0;
+                pos++;
+            }
+        }
+    }
+
+    // The pointer tables were sized for _subStreams entries. Grow them first
+    // when more sub-streams are requested; otherwise the remap loop below
+    // would write past the end of both tables.
+    if (newstreams > _subStreams)
+    {
+        T** pNewSysStream = new T*[newstreams];
+        T** pNewDevStream = new T*[newstreams];
+        delete[] _pSysStream;
+        delete[] _pDevStream;
+        _pSysStream = pNewSysStream;
+        _pDevStream = pNewDevStream;
+    }
+
+    // Remap stream pointers
+    for (unsigned int i = 0; i < newstreams; i++)
+    {
+        _pSysStream[i] = _pSysData + i * newstride;
+        _pDevStream[i] = _pDevData + i * newstride;
+    }
+
+    // Copy data back into original stream
+    for (unsigned int i = 0; i < newlength; i++)
+        for (unsigned int j = 0; j < newstreams; j++)
+            _pSysStream[j][i] = pTemp[j * newstride + i];
+
+    _stride = newstride;
+    _length = newlength;
+    _subStreams = newstreams;
+    delete[] pTemp;
+}
+
+// Compile-time launch parameters.
+// GRID is 32 and GRIDBITS is 5, i.e. GRID == 1 << GRIDBITS; keep the two in
+// sync if either changes.
+// NOTE(review): the G8X_ / GT2XX_ prefixes presumably select values per GPU
+// generation -- confirm against the code that reads these constants.
+static const int GRID = 32;
+static const int GRIDBITS = 5;
+static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
+static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
+static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
+static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
+static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
+static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
+static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
+static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
+static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
+static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
+static const int G8X_THREADS_PER_BLOCK                  = 256;
+static const int GT2XX_THREADS_PER_BLOCK                = 256;
+static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
+static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
+static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
+static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
+
+
+// Plain-data record of simulation parameters and raw buffer pointers.
+// It is split into a block of constants, a block of mutable per-atom
+// buffers, and the random-number state.
+// NOTE(review): whether the pointer members hold device or host addresses is
+// not visible in this header -- confirm where the structure is filled in.
+struct cudaGmxSimulation {
+    // Constants
+    unsigned int    atoms;                          // Number of atoms
+    unsigned int    paddedNumberOfAtoms;            // Padded number of atoms
+    unsigned int    blocks;                         // Number of blocks to launch across linear kernels
+    unsigned int    nonbond_blocks;                 // Number of blocks to launch across CDLJ and Born Force Part1
+    unsigned int    bornForce2_blocks;              // Number of blocks to launch across Born Force 2
+    unsigned int    threads_per_block;              // Threads per block to launch
+    unsigned int    nonbond_threads_per_block;      // Threads per block in nonbond kernel calls
+    unsigned int    bornForce2_threads_per_block;   // Threads per block in Born Force 2 kernel calls
+    unsigned int    max_update_threads_per_block;   // Maximum threads per block in update kernel calls
+    unsigned int    update_threads_per_block;       // Threads per block in update kernel calls
+    unsigned int    bf_reduce_threads_per_block;    // Threads per block in Born Force reduction calls
+    unsigned int    bsf_reduce_threads_per_block;   // Threads per block in Born Sum And Forces reduction calls
+    unsigned int    max_shake_threads_per_block;    // Maximum threads per block in shake kernel calls
+    unsigned int    shake_threads_per_block;        // Threads per block in shake kernel calls
+    unsigned int    nonshake_threads_per_block;     // Threads per block in nonshaking kernel call
+    unsigned int    max_localForces_threads_per_block;  // Maximum threads per block in local forces kernel calls
+    unsigned int    localForces_threads_per_block;  // Threads per block in local forces kernel calls
+    unsigned int    random_threads_per_block;       // Threads per block in RNG kernel calls
+    unsigned int    workUnits;                      // Number of work units
+    unsigned int*   pWorkUnit;                      // Pointer to work units
+    unsigned int    nonbond_workBlock;              // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
+    unsigned int    bornForce2_workBlock;           // Number of work units running second half of Born Forces calculation
+    unsigned int    workUnitsPerSM;                 // Number of workblocks per SM
+    unsigned int    nbWorkUnitsPerBlock;            // Number of work units assigned to each nonbond block
+    unsigned int    nbWorkUnitsPerBlockRemainder;   // Remainder of work units to assign across lower numbered nonbond blocks
+    unsigned int    bf2WorkUnitsPerBlock;           // Number of work units assigned to each bornForce2 block
+    unsigned int    bf2WorkUnitsPerBlockRemainder;  // Remainder of work units to assign across lower numbered bornForce2 blocks
+
+
+    unsigned int    stride;                         // Atomic attributes stride
+    unsigned int    stride2;                        // Atomic attributes stride x 2
+    unsigned int    stride3;                        // Atomic attributes stride x 3
+    unsigned int    stride4;                        // Atomic attributes stride x 4
+    unsigned int    exclusionStride;                // Exclusion list stride = stride / GRID
+    unsigned int	nonbondOutputBuffers;           // Nonbond output buffers per nonbond call
+    unsigned int    totalNonbondOutputBuffers;      // Total nonbond output buffers
+    unsigned int    outputBuffers;                  // Number of output buffers
+    float           bigFloat;                       // Floating point value used as a flag for Shaken atoms
+    float           epsfac;                         // Epsilon factor for CDLJ calculations
+    float           probeRadius;                    // SASA probe radius
+    float           surfaceAreaFactor;              // ACE approximation surface area factor
+    float           electricConstant;               // ACE approximation electric constant
+    float           forceConversionFactor;          // kJ to kcal force conversion factor
+    float           preFactor;                      // Born electrostatic pre-factor
+    float			dielectricOffset;		        // Born dielectric offset
+    float			alphaOBC;				        // OBC alpha factor
+    float			betaOBC;				        // OBC beta factor
+    float			gammaOBC;				        // OBC gamma factor
+    float           deltaT;                         // Molecular dynamics deltaT constant
+    float           oneOverDeltaT;                  // 1/deltaT
+    float           B;                              // Molecular dynamics B constant
+    float           C;                              // Molecular dynamics C constant
+    float           D;                              // Molecular dynamics D constant
+    float           EPH;                            // Molecular dynamics EPH constant
+    float           EMH;                            // Molecular dynamics EMH constant
+    float           EM;                             // Molecular dynamics EM constant
+    float           EP;                             // Molecular dynamics EP constant
+    float           GDT;                            // Molecular dynamics GDT constant
+    float           OneMinusEM;                     // Molecular dynamics OneMinusEM constant
+    float           TauOneMinusEM;                  // Molecular dynamics TauOneMinusEM constant
+    float           TauDOverEMMinusOne;             // Molecular dynamics TauDOverEMMinusOne constant
+    float           T;                              // Molecular dynamics T constant
+    float           kT;                             // Boltzmann's constant times T
+    float           V;                              // Molecular dynamics V constant
+    float           X;                              // Molecular dynamics X constant
+    float           Yv;                             // Molecular dynamics Yv constant
+    float           Yx;                             // Molecular dynamics Yx constant
+    float           tau;                            // Molecular dynamics tau constant
+    float           fix1;                           // Molecular dynamics fix1 constant
+    float           oneOverFix1;                    // Molecular dynamics reciprocal of fix1 constant
+    float           DOverTauC;                      // Molecular dynamics DOverTauC constant
+    float           collisionProbability;           // Collision probability for Andersen thermostat
+    float2*         pObcData;                       // Pointer to fixed Born data
+    float2*         pAttr;                          // Pointer to additional atom attributes (sig, eps)
+    unsigned int    bonds;                          // Number of bonds
+    int4*           pBondID;                        // Bond atom and output buffer IDs
+    float2*         pBondParameter;                 // Bond parameters
+    unsigned int    bond_angles;                    // Number of bond angles
+    int4*           pBondAngleID1;                  // Bond angle atom and first output buffer IDs
+    int2*           pBondAngleID2;                  // Bond angle output buffer IDs
+    float2*         pBondAngleParameter;            // Bond angle parameters
+    unsigned int    dihedrals;                      // Number of dihedrals
+    int4*           pDihedralID1;                   // Dihedral IDs
+    int4*           pDihedralID2;                   // Dihedral output buffer IDs
+    float4*         pDihedralParameter;             // Dihedral parameters
+    unsigned int    rb_dihedrals;                   // Number of Ryckaert Bellemans dihedrals
+    int4*           pRbDihedralID1;                 // Ryckaert Bellemans Dihedral IDs
+    int4*           pRbDihedralID2;                 // Ryckaert Bellemans Dihedral output buffer IDs
+    float4*         pRbDihedralParameter1;          // Ryckaert Bellemans Dihedral parameters
+    float2*         pRbDihedralParameter2;          // Ryckaert Bellemans Dihedral parameters
+    unsigned int    LJ14s;                          // Number of Lennard Jones 1-4 interactions
+    int4*           pLJ14ID;                        // Lennard Jones 1-4 atom and output buffer IDs
+    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
+    float           inverseTotalMass;               // Used in linear momentum removal
+    unsigned int    ShakeConstraints;               // Total number of Shake constraints
+    unsigned int    NonShakeConstraints;            // Total number of NonShake atoms
+    unsigned int    maxShakeIterations;             // Maximum shake iterations
+    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
+    float           shakeTolerance;                 // Shake tolerance
+    float           InvMassJ;                       // Shake inverse mass for hydrogens
+    int*            pNonShakeID;                    // Not Shaking atoms
+    int4*           pShakeID;                       // Shake atoms and phase
+    float4*         pShakeParameter;                // Shake parameters
+    unsigned int*   pExclusion;                     // Nonbond exclusion data
+    unsigned int    bond_offset;                    // Offset to end of bonds
+    unsigned int    bond_angle_offset;              // Offset to end of bond angles
+    unsigned int    dihedral_offset;                // Offset to end of dihedrals
+    unsigned int    rb_dihedral_offset;             // Offset to end of Ryckaert Bellemans dihedrals
+    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
+
+    // Mutable stuff
+    float4*         pPosq;                          // Pointer to atom positions and charges
+    float4*         pPosqP;                         // Pointer to mid-integration atom positions
+    float4*         pOldPosq;                       // Pointer to old atom positions
+    float4*         pVelm4;                         // Pointer to atom velocity and inverse mass
+    float4*         pvVector4;                      // Pointer to atom v Vector
+    float4*         pxVector4;                      // Pointer to atom x Vector
+    float4*         pForce4;                        // Pointer to all force4 data
+    float4*         pForce4a;                       // Pointer to first set of force4 data
+    float4*         pForce4b;                       // Pointer to second set of force4 data
+    float4*         pOutForce4;                     // Pointer to output float4 force
+    float*          pBornForce;                     // Pointer to Born force data
+    float*			pBornSum;                       // Pointer to Born Radii calculation output buffers
+    float*			pBornRadii;				        // Pointer to Born Radii
+    float*          pObcChain;                      // Pointer to OBC chain data
+    float4*         pLinearMomentum;                // Pointer to linear momentum
+    
+    // Random numbers
+    float4*         pRandom4a;                      // Pointer to first set of 4 random numbers
+    float4*         pRandom4b;                      // Pointer to second set of 4 random numbers
+    float2*         pRandom2a;                      // Pointer to first set of 2 random numbers
+    float2*         pRandom2b;                      // Pointer to second set of 2 random numbers
+    uint4*          pRandomSeed;                    // Pointer to random seeds
+    int*            pRandomPosition;                // Pointer to random number positions
+    unsigned int    randoms;                        // Number of randoms
+    unsigned int    totalRandoms;                   // Number of randoms plus overflow.
+    unsigned int    totalRandomsTimesTwo;           // Used for generating randoms
+    unsigned int    randomIterations;               // Number of iterations before regenerating randoms
+    unsigned int    randomFrames;                   // Number of frames of random numbers
+};
+
+// Bundle of three float3 values, stored in the order v0, v1, v2.
+struct Vectors {
+    float3 v0, v1, v2;
+};
+
+#endif
--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <ctime>
+#include <cmath>
+#include <map>
+#ifdef WIN32
+  #include <windows.h>
+#else
+  #include <stdint.h>
+#endif
+using namespace std;
+
+#include "gputypes.h"
+#include "cudaKernels.h"
+#include "OpenMMException.h"
+
+using OpenMM::OpenMMException;
+
+// Short aliases for the scalar types used by the FAH_* structures below.
+// The 64-bit integer aliases come from the MSVC __int64 keyword when WIN32 is
+// defined and from <stdint.h> otherwise.
+#ifdef WIN32
+  typedef unsigned __int64 u64;
+  typedef signed __int64 s64;
+#else
+  typedef uint64_t u64;
+  typedef int64_t s64;
+#endif
+typedef unsigned int u32;
+typedef float f32;
+typedef double f64;
+// ascii and utf8 are both plain char; the names only document intent.
+typedef char ascii;
+typedef char utf8;
+typedef unsigned char u8;
+typedef signed char s8;
+typedef unsigned short u16;
+typedef signed short s16;
+// Per-atom record: a four-byte type tag followed by charge and radius.
+typedef struct
+{
+  u8  type[4];
+  f32 charge, radius;
+} FAH_ATOM;
+
+// Bond record: a pair of indices a and b, stored so that a < b.
+typedef struct
+{
+  u32 a, b; /* rule: a < b */
+} FAH_BOND;
+
+// Three single-precision components x, y, z, in that order.
+typedef struct
+{
+  f32 x, y, z;
+} FAH_XYZ;
+
+// Header record: magic/version tags, a name, a timestamp, iteration and
+// frame counts, atom and bond counts, then the user fields marked "v2".
+typedef struct
+{
+  u32    magic, version;
+  utf8   name[64];
+  s64    timestamp;
+  u64    iterations;
+  u32    frames, atom_count, bond_count;
+  /* v2 */
+  utf8   user_name[64], user_team[16], user_done[16];
+} FAH_INFO;
+
+// Progress record: magic/version tags, a timestamp, completed iteration and
+// frame counts, and the current energy and temperature.
+typedef struct
+{
+  u32    magic, version;
+  s64    timestamp;
+  u64    iterations_done;
+  u32    frames_done;
+  f32    energy, temperature;
+} FAH_CURRENT;
+
+// Aggregate of the header record, the progress record, and pointers to the
+// atom, bond and coordinate arrays.
+typedef struct
+{
+  FAH_INFO    info;
+  FAH_CURRENT current;
+  FAH_ATOM*   atoms;
+  FAH_BOND*   bonds;
+  FAH_XYZ*    xyz;
+} PROTEIN;
+
+// A central atom together with up to three peripheral atoms constrained to
+// it. Every constraint in one cluster must share the same distance and the
+// same peripheral inverse mass; addAtom() enforces both.
+struct ShakeCluster {
+    int centralID;                              // ID of the central atom
+    int peripheralID[3];                        // IDs of the constrained atoms; only the first `size` are valid
+    int size;                                   // Number of peripheral atoms added so far (0..3)
+    float distance;                             // Constraint distance shared by all peripheral atoms
+    float centralInvMass, peripheralInvMass;    // Inverse masses of the central and peripheral atoms
+
+    // Creates an empty cluster with no central atom. Every member is
+    // initialised so that addAtom() never reads an indeterminate `size`,
+    // which the previous empty constructor allowed.
+    ShakeCluster() : centralID(-1), size(0), distance(0.0f), centralInvMass(0.0f), peripheralInvMass(0.0f) {
+        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
+    }
+
+    // Creates an empty cluster around the given central atom.
+    // Initializers are listed in member declaration order.
+    ShakeCluster(int centralID, float invMass) : centralID(centralID), size(0), distance(0.0f), centralInvMass(invMass), peripheralInvMass(0.0f) {
+        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
+    }
+
+    // Adds one peripheral atom to the cluster.
+    //   id      - ID of the peripheral atom
+    //   dist    - constraint distance to the central atom
+    //   invMass - inverse mass of the peripheral atom
+    // Throws OpenMMException if the cluster already holds three atoms, or if
+    // dist or invMass differs from the values given for earlier atoms.
+    void addAtom(int id, float dist, float invMass) {
+        if (size == 3)
+            throw OpenMMException("A single atom may only have three constraints");
+        if (size > 0 && dist != distance)
+            throw OpenMMException("All constraints for a central atom must have the same distance");
+        if (size > 0 && invMass != peripheralInvMass)
+            throw OpenMMException("All constraints for a central atom must have the same mass");
+        peripheralID[size++] = id;
+        distance = dist;
+        peripheralInvMass = invMass;
+    }
+};
+
+static const float dielectricOffset         =    0.009f;                   // Subtracted from each atomic radius in gpuSetObcParameters()
+static const float PI                       =    3.1415926535f;
+static const float probeRadius              =    0.14f;                    // NOTE(review): not referenced in this section; units not stated - confirm
+static const float forceConversionFactor    =    0.4184f;                  // NOTE(review): gpuSetObcParameters() uses gpu->sim.forceConversionFactor, not this constant
+
+//static const float surfaceAreaFactor        =   -6.0f * 0.06786f * forceConversionFactor * 1000.0f;  // PI * 4.0f * 0.0049f * 1000.0f;
+//static const float surfaceAreaFactor        =   -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
+static const float surfaceAreaFactor        = -6.0f*PI*0.0216f*1000.0f*0.4184f;   // Active definition; the commented-out lines around it are earlier alternatives
+//static const float surfaceAreaFactor        = -1.7035573959e+001;
+//static const float surfaceAreaFactor        = -166.02691f;
+//static const float surfaceAreaFactor        = 1.0f;
+
+static const float alphaOBC                 =    1.0f;                     // alphaOBC/betaOBC/gammaOBC: presumably the OBC model coefficients - confirm
+static const float betaOBC                  =    0.8f;
+static const float gammaOBC                 =    4.85f;
+static const float kcalMolTokJNM            =   -0.4184f;
+static const float electricConstant         = -166.02691f;                 // Factor in gpu->sim.preFactor, see gpuSetObcParameters()
+static const float defaultInnerDielectric   =    1.0f;                     // Passed to gpuSetObcParameters() by gpuReadCoulombParameters()
+static const float defaultSolventDielectric =   78.3f;                     // Passed to gpuSetObcParameters() by gpuReadCoulombParameters()
+static const float KILO                     =    1e3;                      // Thousand
+static const float BOLTZMANN                =    1.380658e-23f;            // (J/K)    
+static const float AVOGADRO                 =    6.0221367e23f;            // ()        
+static const float RGAS                     =    BOLTZMANN * AVOGADRO;     // (J/(mol K))
+static const float BOLTZ                    =    (RGAS / KILO);            // (kJ/(mol K)) 
+
+#define DUMP_PARAMETERS 0      /* When 1, the #if (DUMP_PARAMETERS == 1) blocks in the gpuSet*/gpuRead* functions print records to cout. */
+
+#define DeltaShake             /* Defined with no value; not referenced in this section - presumably tested with #ifdef elsewhere, confirm. */
+
+
+/**
+ * Read harmonic bond parameters from a text file and hand them to
+ * gpuSetBondParameters().
+ *
+ * Layout as parsed here: the first token is the bond count and the rest of that
+ * line is ignored. Each following record has five whitespace-separated fields:
+ * an ignored integer, the two atom indices, the length and k.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return the number of bonds read
+ */
+extern "C"
+int gpuReadBondParameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening harmonic bond parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    int bonds = 0;
+    infile >> bonds;
+    if (infile.fail() || bonds < 0)
+    {
+        cout << "Error reading record count from harmonic bond parameter file " << fname << endl;
+        exit(-1);
+    }
+    char buff[512];
+    infile.getline(buff, 512);
+
+    vector<int> atom1(bonds);
+    vector<int> atom2(bonds);
+    vector<float> length(bonds);
+    vector<float> k(bonds);
+    for (int i = 0; i < bonds; i++)
+    {
+        int junk;
+        infile >> 
+            junk >> 
+            atom1[i] >> 
+            atom2[i] >> 
+            length[i] >> 
+            k[i];
+        // A short or malformed file must not be uploaded as zero-filled records.
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of harmonic bond parameter file " << fname << endl;
+            exit(-1);
+        }
+    }
+    gpuSetBondParameters(gpu, atom1, atom2, length, k);
+    return bonds;
+}
+
+/**
+ * Store harmonic bond parameters in the context and upload them.
+ *
+ * Two streams of atom1.size() entries are created and recorded in gpu and
+ * gpu->sim. For bond i the ID entry holds (atom1, atom2, slot1, slot2), where
+ * each slot is the current value of gpu->pOutputBufferCounter for that atom,
+ * post-incremented. The parameter entry holds (length, k). Upload() is then
+ * called on both streams.
+ *
+ * @param gpu     context to populate
+ * @param atom1   first atom index of each bond; its size defines the bond count
+ * @param atom2   second atom index of each bond
+ * @param length  value stored in the x component of each parameter entry
+ * @param k       value stored in the y component of each parameter entry
+ */
+extern "C"
+void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
+{
+    const int bondCount = atom1.size();
+    gpu->sim.bonds = bondCount;
+
+    CUDAStream<int4>* idStream = new CUDAStream<int4>(bondCount, 1);
+    gpu->psBondID = idStream;
+    gpu->sim.pBondID = idStream->_pDevStream[0];
+
+    CUDAStream<float2>* paramStream = new CUDAStream<float2>(bondCount, 1);
+    gpu->psBondParameter = paramStream;
+    gpu->sim.pBondParameter = paramStream->_pDevStream[0];
+
+    for (int n = 0; n < bondCount; n++)
+    {
+        int4& id = idStream->_pSysStream[0][n];
+        float2& param = paramStream->_pSysStream[0][n];
+
+        id.x = atom1[n];
+        id.y = atom2[n];
+        param.x = length[n];
+        param.y = k[n];
+
+        // Reserve one output slot per endpoint, first atom first.
+        id.z = gpu->pOutputBufferCounter[id.x]++;
+        id.w = gpu->pOutputBufferCounter[id.y]++;
+#if (DUMP_PARAMETERS == 1)
+        cout << n << " " << id.x << " " << id.y << " " << id.z << " " << id.w << " "
+             << param.x << " " << param.y << endl;
+#endif
+    }
+
+    idStream->Upload();
+    paramStream->Upload();
+}
+
+/**
+ * Read harmonic bond-angle parameters from a text file and hand them to
+ * gpuSetBondAngleParameters().
+ *
+ * Layout as parsed here: the first token is the record count and the rest of
+ * that line is ignored. Each following record has six whitespace-separated
+ * fields: an ignored integer, three atom indices, the angle and k.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return the number of bond angles read
+ */
+extern "C"
+int gpuReadBondAngleParameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening harmonic bond angle parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    int bond_angles = 0;
+    infile >> bond_angles;
+    if (infile.fail() || bond_angles < 0)
+    {
+        cout << "Error reading record count from harmonic bond angle parameter file " << fname << endl;
+        exit(-1);
+    }
+    char buff[512];
+    infile.getline(buff, 512);
+
+    vector<int> atom1(bond_angles);
+    vector<int> atom2(bond_angles);
+    vector<int> atom3(bond_angles);
+    vector<float> angle(bond_angles);
+    vector<float> k(bond_angles);
+
+    for (int i = 0; i < bond_angles; i++)
+    {
+        int junk;
+        infile >> 
+            junk >> 
+            atom1[i] >> 
+            atom2[i] >> 
+            atom3[i] >> 
+            angle[i] >> 
+            k[i];
+        // A short or malformed file must not be uploaded as zero-filled records.
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of harmonic bond angle parameter file " << fname << endl;
+            exit(-1);
+        }
+    }
+    gpuSetBondAngleParameters(gpu, atom1, atom2, atom3, angle, k);
+    return bond_angles;
+}
+
+/**
+ * Store harmonic bond-angle parameters in the context and upload them.
+ *
+ * Three streams of atom1.size() entries are created and recorded in gpu and
+ * gpu->sim. For angle i, ID1 holds (atom1, atom2, atom3, slot1) and ID2 holds
+ * (slot2, slot3), where each slot is the current value of
+ * gpu->pOutputBufferCounter for the corresponding atom, post-incremented in
+ * atom1, atom2, atom3 order. The parameter entry holds (angle, k). Upload() is
+ * then called on all three streams.
+ *
+ * @param gpu    context to populate
+ * @param atom1  first atom index of each angle; its size defines the count
+ * @param atom2  second atom index of each angle
+ * @param atom3  third atom index of each angle
+ * @param angle  value stored in the x component of each parameter entry
+ * @param k      value stored in the y component of each parameter entry
+ */
+extern "C"
+void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
+        const vector<float>& angle, const vector<float>& k)
+{
+    const int angleCount = atom1.size();
+    gpu->sim.bond_angles = angleCount;
+
+    CUDAStream<int4>* id1Stream = new CUDAStream<int4>(angleCount, 1);
+    gpu->psBondAngleID1 = id1Stream;
+    gpu->sim.pBondAngleID1 = id1Stream->_pDevStream[0];
+
+    CUDAStream<int2>* id2Stream = new CUDAStream<int2>(angleCount, 1);
+    gpu->psBondAngleID2 = id2Stream;
+    gpu->sim.pBondAngleID2 = id2Stream->_pDevStream[0];
+
+    CUDAStream<float2>* paramStream = new CUDAStream<float2>(angleCount, 1);
+    gpu->psBondAngleParameter = paramStream;
+    gpu->sim.pBondAngleParameter = paramStream->_pDevStream[0];
+
+    for (int n = 0; n < angleCount; n++)
+    {
+        int4& id1 = id1Stream->_pSysStream[0][n];
+        int2& id2 = id2Stream->_pSysStream[0][n];
+        float2& param = paramStream->_pSysStream[0][n];
+
+        id1.x = atom1[n];
+        id1.y = atom2[n];
+        id1.z = atom3[n];
+        param.x = angle[n];
+        param.y = k[n];
+
+        // Reserve one output slot per atom, in atom1, atom2, atom3 order.
+        id1.w = gpu->pOutputBufferCounter[id1.x]++;
+        id2.x = gpu->pOutputBufferCounter[id1.y]++;
+        id2.y = gpu->pOutputBufferCounter[id1.z]++;
+#if (DUMP_PARAMETERS == 1)
+        cout << n << " " << id1.x << " " << id1.y << " " << id1.z << " " << id1.w << " "
+             << id2.x << " " << id2.y << " "
+             << param.x << " " << param.y << endl;
+#endif
+    }
+
+    id1Stream->Upload();
+    id2Stream->Upload();
+    paramStream->Upload();
+}
+
+/**
+ * Read periodic dihedral parameters from a text file and hand them to
+ * gpuSetDihedralParameters().
+ *
+ * Layout as parsed here: the first token is the record count and the rest of
+ * that line is ignored. Each following record has eight whitespace-separated
+ * fields: an ignored integer, four atom indices, k, phase and an integer
+ * periodicity.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return the number of dihedrals read
+ */
+extern "C"
+int gpuReadDihedralParameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening dihedral parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    int dihedrals = 0;
+    infile >> dihedrals;
+    if (infile.fail() || dihedrals < 0)
+    {
+        cout << "Error reading record count from dihedral parameter file " << fname << endl;
+        exit(-1);
+    }
+    char buff[512];
+    infile.getline(buff, 512);
+
+    vector<int> atom1(dihedrals);
+    vector<int> atom2(dihedrals);
+    vector<int> atom3(dihedrals);
+    vector<int> atom4(dihedrals);
+    vector<float> k(dihedrals);
+    vector<float> phase(dihedrals);
+    vector<int> periodicity(dihedrals);
+    for (int i = 0; i < dihedrals; i++)
+    {
+        int junk;
+        infile >> 
+            junk >> 
+            atom1[i] >> 
+            atom2[i] >> 
+            atom3[i] >>
+            atom4[i] >> 
+            k[i] >> 
+            phase[i] >>
+            periodicity[i];
+        // A short or malformed file must not be uploaded as zero-filled records.
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of dihedral parameter file " << fname << endl;
+            exit(-1);
+        }
+    }
+    gpuSetDihedralParameters(gpu, atom1, atom2, atom3, atom4, k, phase, periodicity);
+    return dihedrals;
+}
+
+/**
+ * Store periodic dihedral parameters in the context and upload them.
+ *
+ * Three streams of atom1.size() entries are created and recorded in gpu and
+ * gpu->sim. For dihedral i, ID1 holds the four atom indices and ID2 holds one
+ * output slot per atom, each taken from gpu->pOutputBufferCounter for that atom
+ * and post-incremented in atom1..atom4 order. The parameter entry holds
+ * (k, phase, periodicity as float); its w component is not written here.
+ * Upload() is then called on all three streams.
+ *
+ * @param gpu          context to populate
+ * @param atom1        first atom index of each dihedral; its size defines the count
+ * @param atom2        second atom index
+ * @param atom3        third atom index
+ * @param atom4        fourth atom index
+ * @param k            value stored in the x component of each parameter entry
+ * @param phase        value stored in the y component
+ * @param periodicity  value converted to float and stored in the z component
+ */
+extern "C"
+void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
+        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
+{
+    const int dihedralCount = atom1.size();
+    gpu->sim.dihedrals = dihedralCount;
+
+    CUDAStream<int4>* id1Stream = new CUDAStream<int4>(dihedralCount, 1);
+    gpu->psDihedralID1 = id1Stream;
+    gpu->sim.pDihedralID1 = id1Stream->_pDevStream[0];
+
+    CUDAStream<int4>* id2Stream = new CUDAStream<int4>(dihedralCount, 1);
+    gpu->psDihedralID2 = id2Stream;
+    gpu->sim.pDihedralID2 = id2Stream->_pDevStream[0];
+
+    CUDAStream<float4>* paramStream = new CUDAStream<float4>(dihedralCount, 1);
+    gpu->psDihedralParameter = paramStream;
+    gpu->sim.pDihedralParameter = paramStream->_pDevStream[0];
+
+    for (int n = 0; n < dihedralCount; n++)
+    {
+        int4& atoms = id1Stream->_pSysStream[0][n];
+        int4& slots = id2Stream->_pSysStream[0][n];
+        float4& param = paramStream->_pSysStream[0][n];
+
+        atoms.x = atom1[n];
+        atoms.y = atom2[n];
+        atoms.z = atom3[n];
+        atoms.w = atom4[n];
+        param.x = k[n];
+        param.y = phase[n];
+        param.z = (float) periodicity[n];
+
+        // Reserve one output slot per atom, in atom1..atom4 order.
+        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
+        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
+        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
+        slots.w = gpu->pOutputBufferCounter[atoms.w]++;
+#if (DUMP_PARAMETERS == 1)
+        cout << n << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " "
+             << slots.x << " " << slots.y << " " << slots.z << " " << slots.w << " "
+             << param.x << " " << param.y << " " << param.z << endl;
+#endif
+    }
+
+    id1Stream->Upload();
+    id2Stream->Upload();
+    paramStream->Upload();
+}
+
+/**
+ * Read Ryckaert-Bellemans dihedral parameters from a text file and hand them
+ * to gpuSetRbDihedralParameters().
+ *
+ * Layout as parsed here: the first token is the record count and the rest of
+ * that line is ignored. Each following record has eleven whitespace-separated
+ * fields: an ignored integer, four atom indices and the coefficients c0..c5.
+ *
+ * This function allocates no streams itself: gpuSetRbDihedralParameters()
+ * creates them and sets gpu->sim.rb_dihedrals. Allocating them here as well
+ * would leave the first set unreferenced once the setter overwrites the
+ * pointers.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return the number of dihedrals read
+ */
+extern "C"
+int gpuReadRbDihedralParameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening Ryckaert-Bellemans dihedral parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    int rb_dihedrals = 0;
+    infile >> rb_dihedrals;
+    if (infile.fail() || rb_dihedrals < 0)
+    {
+        cout << "Error reading record count from Ryckaert-Bellemans dihedral parameter file " << fname << endl;
+        exit(-1);
+    }
+    char buff[512];
+    infile.getline(buff, 512);
+
+    vector<int> atom1(rb_dihedrals);
+    vector<int> atom2(rb_dihedrals);
+    vector<int> atom3(rb_dihedrals);
+    vector<int> atom4(rb_dihedrals);
+    vector<float> c0(rb_dihedrals);
+    vector<float> c1(rb_dihedrals);
+    vector<float> c2(rb_dihedrals);
+    vector<float> c3(rb_dihedrals);
+    vector<float> c4(rb_dihedrals);
+    vector<float> c5(rb_dihedrals);
+
+    for (int i = 0; i < rb_dihedrals; i++)
+    {
+        int junk;
+        infile >> 
+            junk >> 
+            atom1[i] >> 
+            atom2[i] >> 
+            atom3[i] >>
+            atom4[i] >> 
+            c0[i] >> 
+            c1[i] >> 
+            c2[i] >> 
+            c3[i] >> 
+            c4[i] >> 
+            c5[i];
+        // A short or malformed file must not be uploaded as zero-filled records.
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of Ryckaert-Bellemans dihedral parameter file " << fname << endl;
+            exit(-1);
+        }
+    }
+    gpuSetRbDihedralParameters(gpu, atom1, atom2, atom3, atom4, c0, c1, c2, c3, c4, c5);
+    return rb_dihedrals;
+}
+
+/**
+ * Store Ryckaert-Bellemans dihedral parameters in the context and upload them.
+ *
+ * Four streams of atom1.size() entries are created and recorded in gpu and
+ * gpu->sim. For dihedral i, ID1 holds the four atom indices and ID2 holds one
+ * output slot per atom, each taken from gpu->pOutputBufferCounter for that atom
+ * and post-incremented in atom1..atom4 order. Parameter1 holds (c0, c1, c2, c3)
+ * and Parameter2 holds (c4, c5). Upload() is then called on all four streams.
+ *
+ * @param gpu          context to populate
+ * @param atom1        first atom index of each dihedral; its size defines the count
+ * @param atom2        second atom index
+ * @param atom3        third atom index
+ * @param atom4        fourth atom index
+ * @param c0,c1,c2,c3  coefficients stored in Parameter1 x, y, z, w
+ * @param c4,c5        coefficients stored in Parameter2 x, y
+ */
+extern "C"
+void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
+        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
+{
+    const int dihedralCount = atom1.size();
+    gpu->sim.rb_dihedrals = dihedralCount;
+
+    CUDAStream<int4>* id1Stream = new CUDAStream<int4>(dihedralCount, 1);
+    gpu->psRbDihedralID1 = id1Stream;
+    gpu->sim.pRbDihedralID1 = id1Stream->_pDevStream[0];
+
+    CUDAStream<int4>* id2Stream = new CUDAStream<int4>(dihedralCount, 1);
+    gpu->psRbDihedralID2 = id2Stream;
+    gpu->sim.pRbDihedralID2 = id2Stream->_pDevStream[0];
+
+    CUDAStream<float4>* param1Stream = new CUDAStream<float4>(dihedralCount, 1);
+    gpu->psRbDihedralParameter1 = param1Stream;
+    gpu->sim.pRbDihedralParameter1 = param1Stream->_pDevStream[0];
+
+    CUDAStream<float2>* param2Stream = new CUDAStream<float2>(dihedralCount, 1);
+    gpu->psRbDihedralParameter2 = param2Stream;
+    gpu->sim.pRbDihedralParameter2 = param2Stream->_pDevStream[0];
+
+    for (int n = 0; n < dihedralCount; n++)
+    {
+        int4& atoms = id1Stream->_pSysStream[0][n];
+        int4& slots = id2Stream->_pSysStream[0][n];
+        float4& lo = param1Stream->_pSysStream[0][n];
+        float2& hi = param2Stream->_pSysStream[0][n];
+
+        atoms.x = atom1[n];
+        atoms.y = atom2[n];
+        atoms.z = atom3[n];
+        atoms.w = atom4[n];
+        lo.x = c0[n];
+        lo.y = c1[n];
+        lo.z = c2[n];
+        lo.w = c3[n];
+        hi.x = c4[n];
+        hi.y = c5[n];
+
+        // Reserve one output slot per atom, in atom1..atom4 order.
+        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
+        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
+        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
+        slots.w = gpu->pOutputBufferCounter[atoms.w]++;
+#if (DUMP_PARAMETERS == 1)
+        cout << n << " " << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " "
+             << slots.x << " " << slots.y << " " << slots.z << " " << slots.w << " "
+             << lo.x << " " << lo.y << " " << lo.z << " " << lo.w << " "
+             << hi.x << " " << hi.y << endl;
+#endif
+    }
+
+    id1Stream->Upload();
+    id2Stream->Upload();
+    param1Stream->Upload();
+    param2Stream->Upload();
+}
+
+/**
+ * Read Lennard-Jones 1-4 parameters from a text file and hand them to
+ * gpuSetLJ14Parameters().
+ *
+ * Header as parsed here: the record count, then up to 60 characters of text
+ * that are skipped, then epsfac, then up to 7 characters that are skipped,
+ * then fudge; the rest of the line is ignored. Each following record has seven
+ * whitespace-separated fields: an ignored integer, two atom indices, c6, c12,
+ * q1 and q2.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return the number of 1-4 records read
+ */
+extern "C"
+int gpuReadLJ14Parameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening Lennard-Jones 1-4 parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    char buff[1024];
+    float epsfac = 0.0f;
+    float fudge = 0.0f;
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    int LJ14s = 0;
+    infile >> LJ14s;
+    if (infile.fail() || LJ14s < 0)
+    {
+        cout << "Error reading record count from Lennard-Jones 1-4 parameter file " << fname << endl;
+        exit(-1);
+    }
+    infile.get(buff, 61);
+    infile >> epsfac;
+    infile.get(buff, 8);
+    infile >> fudge;
+    infile.getline(buff, 512);
+
+    vector<int> atom1(LJ14s);
+    vector<int> atom2(LJ14s);
+    vector<float> c6(LJ14s);
+    vector<float> c12(LJ14s);
+    vector<float> q1(LJ14s);
+    vector<float> q2(LJ14s);
+
+    for (int i = 0; i < LJ14s; i++)
+    {
+        int junk;
+        infile >> 
+            junk >> 
+            atom1[i] >> 
+            atom2[i] >> 
+            c6[i] >> 
+            c12[i] >>
+            q1[i] >>
+            q2[i];
+        // Also catches a header that did not match the expected layout, since
+        // the stream stays in the failed state from that point on.
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of Lennard-Jones 1-4 parameter file " << fname << endl;
+            exit(-1);
+        }
+    }
+    gpuSetLJ14Parameters(gpu, epsfac, fudge, atom1, atom2, c6, c12, q1, q2);
+    return LJ14s;
+}
+
+/**
+ * Store Lennard-Jones 1-4 parameters in the context and upload them.
+ *
+ * Two streams of atom1.size() entries are created and recorded in gpu and
+ * gpu->sim. For pair i the ID entry holds (atom1, atom2, slot1, slot2), where
+ * each slot is the current value of gpu->pOutputBufferCounter for that atom,
+ * post-incremented. The parameter entry holds
+ *   x = c6*c6/c12         (0 when c12 is exactly 0)
+ *   y = (c12/c6)^(1/6)    (1 when c12 is exactly 0)
+ *   z = epsfac * fudge * q1 * q2
+ * and its w component is not written here. Upload() is then called on both
+ * streams.
+ *
+ * @param gpu     context to populate
+ * @param epsfac  multiplied with fudge to scale the charge product
+ * @param fudge   multiplied with epsfac to scale the charge product
+ * @param atom1   first atom index of each pair; its size defines the count
+ * @param atom2   second atom index of each pair
+ * @param c6      c6 coefficient of each pair
+ * @param c12     c12 coefficient of each pair
+ * @param q1      charge of the first atom of each pair
+ * @param q2      charge of the second atom of each pair
+ */
+extern "C"
+void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
+        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
+{
+    int LJ14s = atom1.size();
+    float scale = epsfac * fudge;
+
+    gpu->sim.LJ14s                              = LJ14s;
+    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1);
+    gpu->psLJ14ID                               = psLJ14ID;
+    gpu->sim.pLJ14ID                            = psLJ14ID->_pDevStream[0];
+    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1);
+    gpu->psLJ14Parameter                        = psLJ14Parameter;
+    gpu->sim.pLJ14Parameter                     = psLJ14Parameter->_pDevStream[0];
+
+    for (int i = 0; i < LJ14s; i++)
+    {
+        psLJ14ID->_pSysStream[0][i].x = atom1[i];
+        psLJ14ID->_pSysStream[0][i].y = atom2[i];
+        psLJ14ID->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].x]++;
+        psLJ14ID->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].y]++;
+        float p0, p1, p2;
+        if (c12[i] == 0.0f)
+        {
+            // Avoid dividing by zero; these are the values used for a pair with no c12 term.
+            p0 = 0.0f;
+            p1 = 1.0f;
+        }
+        else
+        {
+            p0 = c6[i] * c6[i] / c12[i];
+            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
+        }
+        p2 = scale * q1[i] * q2[i];
+        psLJ14Parameter->_pSysStream[0][i].x = p0;
+        psLJ14Parameter->_pSysStream[0][i].y = p1;
+        psLJ14Parameter->_pSysStream[0][i].z = p2;
+        // The dump must live inside the loop: it uses i, p0, p1 and p2.
+#if (DUMP_PARAMETERS == 1)
+        cout << 
+            i << " " <<
+            psLJ14ID->_pSysStream[0][i].x << " " << 
+            psLJ14ID->_pSysStream[0][i].y << " " << 
+            psLJ14ID->_pSysStream[0][i].z << " " << 
+            psLJ14ID->_pSysStream[0][i].w << " " << 
+            psLJ14Parameter->_pSysStream[0][i].x << " " << 
+            psLJ14Parameter->_pSysStream[0][i].y << " " <<
+            psLJ14Parameter->_pSysStream[0][i].z << " " << 
+            p0 << " " << 
+            p1 << " " << 
+            p2 << " " << 
+            endl;
+#endif
+    }
+    psLJ14ID->Upload();
+    psLJ14Parameter->Upload();
+}
+
+/**
+ * Look up the radius of an atom type by name.
+ *
+ * Scans gpu->gpAtomTable (gpu->gAtomTypes entries) in order and returns the r
+ * field of the first entry whose name compares equal to s.
+ *
+ * @param gpu  context holding the atom type table
+ * @param s    atom type name to look for
+ * @return the radius of the first matching entry, or 0.0f if none matches
+ */
+extern "C"
+float gpuGetAtomicRadius(gpuContext gpu, string s)
+{
+    int idx = 0;
+    while (idx < gpu->gAtomTypes)
+    {
+        if (s == gpu->gpAtomTable[idx].name)
+            return gpu->gpAtomTable[idx].r;
+        ++idx;
+    }
+
+    // No entry with that name.
+    return 0.0f;
+}
+
+/**
+ * Look up the one-character symbol of an atom type by name.
+ *
+ * Scans gpu->gpAtomTable (gpu->gAtomTypes entries) in order and returns the
+ * symbol field of the first entry whose name compares equal to s.
+ *
+ * @param gpu  context holding the atom type table
+ * @param s    atom type name to look for
+ * @return the symbol of the first matching entry, or ' ' if none matches
+ */
+extern "C"
+unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s)
+{
+    int idx = 0;
+    while (idx < gpu->gAtomTypes)
+    {
+        if (s == gpu->gpAtomTable[idx].name)
+            return gpu->gpAtomTable[idx].symbol;
+        ++idx;
+    }
+
+    // No entry with that name.
+    return ' ';
+}
+
+/**
+ * (Re)build gpu->gpAtomTable from an atom parameter file.
+ *
+ * Any existing table is released first. The file is then read twice. The first
+ * pass counts the lines whose first character is a space (one atom type each)
+ * and the lines that precede the first such line (the header to skip). The
+ * second pass skips that header and reads a name and a radius from each entry,
+ * discarding the rest of the line. Each entry's symbol is chosen from its
+ * radius: below 1.3 'H', below 1.6 'O', below 1.7 'N', otherwise 'C'.
+ *
+ * If the file cannot be opened, a message is printed to cout and the process
+ * terminates with exit(-1).
+ *
+ * @param gpu    context whose atom type table is replaced
+ * @param fname  path of the atom parameter file
+ * @return the number of atom types read
+ */
+extern "C"
+int gpuReadAtomicParameters(gpuContext gpu, char* fname)
+{
+    gpu->gAtomTypes = 0;
+    if (gpu->gpAtomTable)
+        delete[] gpu->gpAtomTable;
+
+    // First pass: count atom types and leading header lines.
+    ifstream counter(fname);
+    if (counter.fail())
+    {
+        cout << "Error opening atom parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    char line[1024];
+    int headerLines = 0;
+    bool inHeader = true;
+    while (counter.getline(line, 512))
+    {
+        if (line[0] != ' ')
+        {
+            // Only lines before the first entry count as header.
+            if (inHeader)
+                headerLines++;
+            continue;
+        }
+        inHeader = false;
+        gpu->gAtomTypes++;
+    }
+    counter.close();
+
+    // Second pass: skip the header, then parse one entry per atom type.
+    gpu->gpAtomTable = new gpuAtomType[gpu->gAtomTypes];
+    ifstream reader(fname);
+    for (int skipped = 0; skipped < headerLines; skipped++)
+        reader.getline(line, 512);
+
+    for (int t = 0; t < gpu->gAtomTypes; t++)
+    {
+        gpuAtomType& entry = gpu->gpAtomTable[t];
+        reader >> entry.name >> entry.r;
+        reader.getline(line, 512);
+
+        // Derive the symbol from the radius thresholds.
+        char sym = 'C';
+        if (entry.r < 1.3f)
+            sym = 'H';
+        else if (entry.r < 1.6f)
+            sym = 'O';
+        else if (entry.r < 1.7f)
+            sym = 'N';
+        entry.symbol = sym;
+
+#if (DUMP_PARAMETERS == 1)
+        cout << t << " " << entry.name << " " << entry.symbol << " " << entry.r << endl; 
+#endif
+    }
+    return gpu->gAtomTypes;
+}
+
+/**
+ * Read per-atom nonbonded parameters from a text file and hand them to
+ * gpuSetCoulombParameters() and gpuSetObcParameters().
+ *
+ * Header as parsed here: the atom count, then up to 8 characters that are
+ * skipped, then epsfac, then up to 7 characters that are skipped, then fudge
+ * (read but not used); the rest of the line is ignored. Each following record
+ * holds an ignored integer, c6, c12, q, an atom type name, a scale factor and
+ * an exclusion count, followed by that many exclusion indices. The type name
+ * is resolved to a radius and a symbol with gpuGetAtomicRadius() and
+ * gpuGetAtomicSymbol().
+ *
+ * The total printed to cout counts an exclusion (i, e) only when i >= e.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return the number of atoms read
+ */
+extern "C"
+int gpuReadCoulombParameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening Coulomb parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    char buff[1024];
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    unsigned int coulombs = 0;
+    float fudge = 0.0f;
+    float epsfac = 1.0f;
+    infile >> coulombs;
+    // The count is returned as int, so a value that does not fit is rejected too.
+    if (infile.fail() || (int) coulombs < 0)
+    {
+        cout << "Error reading record count from Coulomb parameter file " << fname << endl;
+        exit(-1);
+    }
+    infile.get(buff, 9);
+    infile >> epsfac;
+    infile.get(buff, 8);
+    infile >> fudge;
+    infile.getline(buff, 512);
+    vector<int> atom(coulombs);
+    vector<float> c6(coulombs);
+    vector<float> c12(coulombs);
+    vector<float> q(coulombs);
+    vector<float> radius(coulombs);
+    vector<float> scale(coulombs);
+    vector<char> symbol(coulombs);
+    vector<vector<int> > exclusions(coulombs);
+    unsigned int total_exclusions = 0;
+
+    for (unsigned int i = 0; i < coulombs; i++)
+    {
+        int junk = 0, numExclusions = 0;
+        // A std::string grows as needed, so an over-long token cannot overrun a buffer.
+        string atype;
+        infile >> 
+            junk >> 
+            c6[i] >>
+            c12[i] >>
+            q[i] >>
+            atype >>
+            scale[i] >>
+            numExclusions;
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of Coulomb parameter file " << fname << endl;
+            exit(-1);
+        }
+        radius[i] = gpuGetAtomicRadius(gpu, atype);
+        symbol[i] = gpuGetAtomicSymbol(gpu, atype);
+        for (int j = 0; j < numExclusions; j++)
+        {
+            int exclusion = 0;
+            infile >> exclusion;
+            if (infile.fail())
+            {
+                cout << "Error reading exclusions of record " << i << " of Coulomb parameter file " << fname << endl;
+                exit(-1);
+            }
+            exclusions[i].push_back(exclusion);
+            // Count each pair once, from the side with the higher (or equal) index.
+            if (exclusion >= 0 && i >= (unsigned int) exclusion)
+                total_exclusions++;
+        }
+    }
+    cout << total_exclusions << " total exclusions.\n";
+    gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
+    gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
+    return coulombs;
+}
+
+/**
+ * Store per-atom charge, Lennard-Jones terms and exclusions in the context.
+ *
+ * For each of the atom.size() atoms:
+ *   - gpu->psPosq4 w receives q
+ *   - gpu->psSigEps2 x receives 0.5 * (c12/c6)^(1/6) and y receives
+ *     c6 * sqrt(1/c12) when both c6 and c12 are positive; otherwise x = 0.5
+ *     and y = 0
+ *   - gpu->pAtomSymbol receives symbol[i] when symbol is non-empty
+ *   - gpu->pExclusion[i * paddedNumberOfAtoms + e] is set to 0 for every e in
+ *     exclusions[i]
+ * Entries from atom.size() up to gpu->sim.paddedNumberOfAtoms are filled with
+ * placeholder positions (100000 + 10*i on each axis), zero charge and zero
+ * Lennard-Jones terms, and their whole exclusion rows and columns are set
+ * to 0. psPosq4 and psSigEps2 are then uploaded. Finally a warning is printed
+ * to cout for every pair of real atoms whose two exclusion entries disagree.
+ *
+ * @param gpu         context to populate
+ * @param epsfac      stored in gpu->sim.epsfac
+ * @param atom        only its size is used; it defines the number of atoms
+ * @param c6          c6 coefficient per atom
+ * @param c12         c12 coefficient per atom
+ * @param q           charge per atom
+ * @param symbol      symbol per atom, or empty to leave gpu->pAtomSymbol untouched
+ * @param exclusions  for each atom, the indices of the atoms it excludes
+ */
+extern "C"
+void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
+        const vector<char>& symbol, const vector<vector<int> >& exclusions)
+{
+    unsigned int coulombs = atom.size();
+    gpu->sim.epsfac = epsfac;
+
+    for (unsigned int i = 0; i < coulombs; i++)
+    {
+        float p0 = q[i];
+        float p1 = 0.5f, p2 = 0.0f;
+        if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
+        {
+            p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
+            p2 = c6[i] * sqrt(1.0f / c12[i]);
+        }
+        if (symbol.size() > 0)
+            gpu->pAtomSymbol[i] = symbol[i];
+        gpu->psPosq4->_pSysStream[0][i].w = p0;
+        gpu->psSigEps2->_pSysStream[0][i].x = p1;
+        gpu->psSigEps2->_pSysStream[0][i].y = p2;
+
+        // The exclusion indices are appended to this line one by one in the
+        // loop below; the container itself cannot be streamed.
+#if (DUMP_PARAMETERS == 1)
+        cout << 
+            i << " " << 
+            gpu->psPosq4->_pSysStream[0][i].w << " " << 
+            gpu->psSigEps2->_pSysStream[0][i].x << " " <<
+            gpu->psSigEps2->_pSysStream[0][i].y << " " << 
+            p0 << " " <<
+            p1 << " " <<
+            p2;
+#endif
+        for (int j = 0; j < (int) exclusions[i].size(); j++)
+        {
+#if (DUMP_PARAMETERS == 1)
+            cout << " " << exclusions[i][j];
+#endif
+
+            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
+        }
+#if (DUMP_PARAMETERS == 1)
+        cout << endl;
+#endif
+
+    }
+
+    // Dummy out extra atom data
+    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        gpu->psPosq4->_pSysStream[0][i].x       = 100000.0f + i * 10.0f;
+        gpu->psPosq4->_pSysStream[0][i].y       = 100000.0f + i * 10.0f;
+        gpu->psPosq4->_pSysStream[0][i].z       = 100000.0f + i * 10.0f;
+        gpu->psPosq4->_pSysStream[0][i].w       = 0.0f;
+        gpu->psSigEps2->_pSysStream[0][i].x     = 0.0f;
+        gpu->psSigEps2->_pSysStream[0][i].y     = 0.0f;   
+    }
+
+    // Add in remaining exclusions
+    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
+        {
+            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
+            gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
+        }
+    }
+
+    gpu->psPosq4->Upload();
+    gpu->psSigEps2->Upload();
+
+    // Check for exclusion consistency
+    for (unsigned int i = 0; i < coulombs; i++)
+    {
+        for (unsigned int j = i; j < coulombs; j++)
+        {
+
+            if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
+                cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
+        }
+    }
+}
+
+/**
+ * Store the per-atom OBC data in the context and compute gpu->sim.preFactor.
+ *
+ * For each of the atom.size() atoms, gpu->psObcData x receives
+ * radius - dielectricOffset and y receives scale multiplied by that x value.
+ * Entries from atom.size() up to gpu->sim.paddedNumberOfAtoms receive
+ * placeholder values (Born radius 0.2, OBC data 0.01 / 0.01). psBornRadii and
+ * psObcData are then uploaded, and
+ *   preFactor = 2 * electricConstant * (1/innerDielectric - 1/solventDielectric)
+ *               * gpu->sim.forceConversionFactor
+ *
+ * @param gpu                context to populate
+ * @param innerDielectric    inner dielectric constant used in preFactor
+ * @param solventDielectric  solvent dielectric constant used in preFactor
+ * @param atom               only its size is used; it defines the number of atoms
+ * @param radius             radius per atom, before the offset is subtracted
+ * @param scale              scale factor per atom
+ */
+extern "C"
+void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale)
+{
+    unsigned int atoms = atom.size();
+
+    for (unsigned int i = 0; i < atoms; i++)
+    {
+        gpu->psObcData->_pSysStream[0][i].x = radius[i] - dielectricOffset;
+        gpu->psObcData->_pSysStream[0][i].y = scale[i] * gpu->psObcData->_pSysStream[0][i].x;
+
+        // One line per atom, terminated like the dumps of the other setters.
+#if (DUMP_PARAMETERS == 1)
+        cout << 
+            i << " " << 
+            gpu->psObcData->_pSysStream[0][i].x << " " <<
+            gpu->psObcData->_pSysStream[0][i].y <<
+            endl;
+#endif
+    }
+
+    // Dummy out extra atom data
+    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        gpu->psBornRadii->_pSysStream[0][i]     = 0.2f;
+        gpu->psObcData->_pSysStream[0][i].x     = 0.01f;
+        gpu->psObcData->_pSysStream[0][i].y     = 0.01f;
+    }
+
+    gpu->psBornRadii->Upload();
+    gpu->psObcData->Upload();
+    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+}
+
+/**
+ * Read SHAKE constraint parameters from a text file and hand them to
+ * gpuSetShakeParameters() with a tolerance of 1e-4.
+ *
+ * Layout as parsed here: one leading token that is ignored, then the
+ * constraint count; the rest of that line is ignored. Each following record
+ * has six whitespace-separated fields: an ignored integer, two atom indices,
+ * the distance and the two inverse masses.
+ *
+ * If the file cannot be opened, or the count or any record cannot be parsed, a
+ * message is printed to cout and the process terminates with exit(-1).
+ *
+ * @param gpu    context that receives the parameters
+ * @param fname  path of the parameter file
+ * @return gpu->sim.ShakeConstraints as left by gpuSetShakeParameters()
+ */
+extern "C"
+int gpuReadShakeParameters(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening Shake parameter file " << fname << endl;
+        exit(-1);
+    }
+
+    // A std::string grows as needed, so an over-long leading token cannot overrun a buffer.
+    string label;
+    // Start from a defined value: a failed extraction may leave the target untouched.
+    int shake_constraints = 0;
+    infile >> label >> shake_constraints;
+    if (infile.fail() || shake_constraints < 0)
+    {
+        cout << "Error reading record count from Shake parameter file " << fname << endl;
+        exit(-1);
+    }
+    char buff[512];
+    infile.getline(buff, 512);
+
+    vector<int> atom1(shake_constraints);
+    vector<int> atom2(shake_constraints);
+    vector<float> distance(shake_constraints);
+    vector<float> invMass1(shake_constraints);
+    vector<float> invMass2(shake_constraints);
+
+    for (int i = 0; i < shake_constraints; i++)
+    {
+        int junk;
+        infile >> 
+            junk >> 
+            atom1[i] >> 
+            atom2[i] >> 
+            distance[i] >> 
+            invMass1[i] >> 
+            invMass2[i];
+        // A short or malformed file must not be passed on as zero-filled records.
+        if (infile.fail())
+        {
+            cout << "Error reading record " << i << " of Shake parameter file " << fname << endl;
+            exit(-1);
+        }
+    }
+    gpuSetShakeParameters(gpu, atom1, atom2, distance, invMass1, invMass2, 1e-4f);
+    return gpu->sim.ShakeConstraints;
+}
+
+// Groups pairwise distance constraints into Shake clusters (one central atom
+// plus the peripheral atoms constrained to it) and uploads them to the device.
+//
+// atom1/atom2  - indices of the two atoms of each constraint
+// distance     - constrained distance of each constraint
+// invMass1/2   - inverse masses matching atom1/atom2
+// tolerance    - stored in gpu->sim.shakeTolerance
+//
+// On return gpu->psShakeID / gpu->psShakeParameter hold one entry per cluster,
+// gpu->sim.ShakeConstraints is the number of clusters, and
+// gpu->sim.shake_threads_per_block is derived from that number.
+//
+// Throws OpenMMException if the five arrays differ in length, if an atom
+// index is outside [0, gpu->natoms), or if a peripheral atom takes part in
+// more than one constraint.
+extern "C"
+void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
+        const vector<float>& invMass1, const vector<float>& invMass2, float tolerance)
+{
+    // All five arrays are indexed in lock step below, so they must agree in length.
+    
+    const size_t numConstraints = atom1.size();
+    if (atom2.size() != numConstraints || distance.size() != numConstraints ||
+        invMass1.size() != numConstraints || invMass2.size() != numConstraints)
+        throw OpenMMException("Shake parameter arrays must all have the same length");
+    
+    // Find how many constraints each atom is involved in.  The indices are
+    // range-checked first because they are used to index constraintCount.
+    
+    vector<int> constraintCount(gpu->natoms, 0);
+    for (size_t i = 0; i < numConstraints; i++) {
+        if (atom1[i] < 0 || atom1[i] >= gpu->natoms || atom2[i] < 0 || atom2[i] >= gpu->natoms)
+            throw OpenMMException("Shake constraint refers to an atom index outside the system");
+        constraintCount[atom1[i]]++;
+        constraintCount[atom2[i]]++;
+    }
+    
+    // Find clusters consisting of a central atom with up to three peripheral atoms.
+    
+    map<int, ShakeCluster> clusters;
+    for (size_t i = 0; i < numConstraints; i++) {
+        // Determine which is the central atom: an atom with more than one
+        // constraint wins (atom1 is tested first); if neither has more than
+        // one, the lower index is taken as central.
+        
+        const bool firstIsCentral = constraintCount[atom1[i]] > 1 ||
+                (constraintCount[atom2[i]] <= 1 && atom1[i] < atom2[i]);
+        int centralID, peripheralID;
+        float centralInvMass, peripheralInvMass;
+        if (firstIsCentral) {
+            centralID = atom1[i];
+            peripheralID = atom2[i];
+            centralInvMass = invMass1[i];
+            peripheralInvMass = invMass2[i];
+        }
+        else {
+            centralID = atom2[i];
+            peripheralID = atom1[i];
+            centralInvMass = invMass2[i];
+            peripheralInvMass = invMass1[i];
+        }
+        if (constraintCount[peripheralID] != 1)
+            throw OpenMMException("Only bonds to hydrogens may be constrained");
+        
+        // Add it to the cluster.
+        
+        if (clusters.find(centralID) == clusters.end()) {
+            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
+        }
+        clusters[centralID].addAtom(peripheralID, distance[i], peripheralInvMass);
+    }
+    
+    // Fill in the Cuda streams.
+    
+    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>((int) clusters.size(), 1);
+    gpu->psShakeID                          = psShakeID;
+    gpu->sim.pShakeID                       = psShakeID->_pDevStream[0]; 
+    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>((int) clusters.size(), 1);
+    gpu->psShakeParameter                   = psShakeParameter;
+    gpu->sim.pShakeParameter                = psShakeParameter->_pDevStream[0];
+    gpu->sim.ShakeConstraints               = clusters.size();
+    int index = 0;
+    for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        const ShakeCluster& cluster = iter->second;
+
+        // ID layout: x = central atom, y/z/w = peripheral atoms, -1 marks an unused slot.
+        psShakeID->_pSysStream[0][index].x = cluster.centralID;
+        psShakeID->_pSysStream[0][index].y = cluster.peripheralID[0];
+        psShakeID->_pSysStream[0][index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
+        psShakeID->_pSysStream[0][index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
+
+        // Parameter layout: x = central inverse mass, y = 0.5/(sum of inverse masses),
+        // z = squared distance, w = peripheral inverse mass.
+        psShakeParameter->_pSysStream[0][index].x = cluster.centralInvMass;
+        psShakeParameter->_pSysStream[0][index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
+        psShakeParameter->_pSysStream[0][index].z = cluster.distance*cluster.distance;
+        psShakeParameter->_pSysStream[0][index].w = cluster.peripheralInvMass;
+        ++index;
+    }
+    psShakeID->Upload();
+    psShakeParameter->Upload();
+    gpu->sim.shakeTolerance = tolerance;
+
+    // Spread the clusters over the blocks, clamped to [1, max_shake_threads_per_block].
+    gpu->sim.shake_threads_per_block     = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks; 
+    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
+        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
+    if (gpu->sim.shake_threads_per_block < 1)
+        gpu->sim.shake_threads_per_block = 1;
+
+#ifdef DeltaShake
+
+    // count number of atoms w/o constraint
+
+    int count = 0;
+    for (int i = 0; i < gpu->natoms; i++)
+       if (constraintCount[i] == 0)
+          count++;
+
+    // Allocate NonShake parameters.  The original guarded this with
+    // "count || true", i.e. it always ran; the unreachable else branch
+    // has been dropped.
+
+    gpu->sim.NonShakeConstraints               = count;
+
+    CUDAStream<int>* psNonShakeID              = new CUDAStream<int>(count, 1);
+    gpu->psNonShakeID                          = psNonShakeID;
+    gpu->sim.pNonShakeID                       = psNonShakeID->_pDevStream[0];
+
+    gpu->sim.nonshake_threads_per_block        = (count + gpu->sim.blocks - 1) / gpu->sim.blocks;
+
+    if (gpu->sim.nonshake_threads_per_block > gpu->sim.max_shake_threads_per_block)
+        gpu->sim.nonshake_threads_per_block = gpu->sim.max_shake_threads_per_block;
+
+    if (gpu->sim.nonshake_threads_per_block < 1)
+        gpu->sim.nonshake_threads_per_block = 1;
+
+    // load indices
+
+    count = 0;
+    for (int i = 0; i < gpu->natoms; i++){
+       if (constraintCount[i] == 0){
+          psNonShakeID->_pSysStream[0][count++] = i;
+       }
+    }
+    psNonShakeID->Upload();
+#endif
+}
+
+// Allocates the per-atom streams, the random-number streams and the linear
+// momentum buffer for gpu->natoms atoms, and records the matching sizes and
+// device pointers in gpu->sim.  Reads gpu->sim.blocks and
+// gpu->sim.random_threads_per_block, so those must already be set.
+// Always returns 1.
+extern "C"
+int gpuAllocateInitialBuffers(gpuContext gpu)
+{
+    // Sizes derived from the atom count; the padded count is the atom count
+    // rounded up to a multiple of GRID.
+    gpu->sim.atoms = gpu->natoms;
+    gpu->sim.paddedNumberOfAtoms = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
+    gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
+    gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
+    gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
+    gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
+    gpu->gpAtomTable = NULL;
+    gpu->gAtomTypes = 0;
+
+    // Positions/charges; the stride values all come from this stream.
+    gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pPosq = gpu->psPosq4->_pDevStream[0];
+    gpu->sim.stride = gpu->psPosq4->_stride;
+    gpu->sim.stride2 = 2 * gpu->sim.stride;
+    gpu->sim.stride3 = 3 * gpu->sim.stride;
+    gpu->sim.stride4 = 4 * gpu->sim.stride;
+    gpu->sim.exclusionStride = gpu->sim.stride / GRID;
+
+    // Remaining per-atom float4 streams
+    gpu->psPosqP4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pPosqP = gpu->psPosqP4->_pDevStream[0];
+    gpu->psOldPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pOldPosq = gpu->psOldPosq4->_pDevStream[0];
+    gpu->psVelm4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pVelm4 = gpu->psVelm4->_pDevStream[0];
+    gpu->psvVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pvVector4 = gpu->psvVector4->_pDevStream[0];
+    gpu->psxVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pxVector4 = gpu->psxVector4->_pDevStream[0];
+
+    // Per-atom scalar and float2 streams
+    gpu->psBornRadii = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pBornRadii = gpu->psBornRadii->_pDevStream[0];
+    gpu->psObcChain = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];
+    gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];
+    gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
+    gpu->pAtomSymbol = new unsigned char[gpu->natoms];
+
+    // Random-number bookkeeping: the seed is taken from the clock, and the
+    // streams hold two halves of totalRandoms entries each (the "a" and "b"
+    // device pointers below).
+    gpu->seed = (unsigned long)time(NULL) & 0x000fffff;
+    gpu->sim.randomFrames = 995;
+    gpu->sim.randomIterations = gpu->sim.randomFrames;
+    gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
+    gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
+    gpu->sim.totalRandomsTimesTwo = gpu->sim.totalRandoms * 2;
+    gpu->psRandom4 = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1);
+    gpu->psRandom2 = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1);
+    gpu->psRandomPosition = new CUDAStream<int>(gpu->sim.blocks, 1);
+    gpu->psRandomSeed = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1);
+    gpu->sim.pRandom4a = gpu->psRandom4->_pDevStream[0];
+    gpu->sim.pRandom2a = gpu->psRandom2->_pDevStream[0];
+    gpu->sim.pRandom4b = gpu->psRandom4->_pDevStream[0] + gpu->sim.totalRandoms;
+    gpu->sim.pRandom2b = gpu->psRandom2->_pDevStream[0] + gpu->sim.totalRandoms;
+    gpu->sim.pRandomPosition = gpu->psRandomPosition->_pDevStream[0];
+    gpu->sim.pRandomSeed = gpu->psRandomSeed->_pDevStream[0];
+
+    // One read position per block, all starting at zero.
+    const int blockCount = (int) gpu->sim.blocks;
+    int* pPosition = gpu->psRandomPosition->_pSysStream[0];
+    for (int block = 0; block < blockCount; ++block)
+        pPosition[block] = 0;
+
+    // One uint4 seed per random-generator thread, drawn from rand() in
+    // x, y, z, w order.
+    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
+    srand(seed);
+    const int seedCount = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
+    uint4* pSeed = gpu->psRandomSeed->_pSysStream[0];
+    for (int thread = 0; thread < seedCount; ++thread)
+    {
+        pSeed[thread].x = rand();
+        pSeed[thread].y = rand();
+        pSeed[thread].z = rand();
+        pSeed[thread].w = rand();
+    }
+
+    // Both random value streams start out zero-filled.
+    const float randomValue = 0.0f;
+    const int valueCount = (int) gpu->sim.totalRandomsTimesTwo;
+    float4* pValue4 = gpu->psRandom4->_pSysStream[0];
+    float2* pValue2 = gpu->psRandom2->_pSysStream[0];
+    for (int slot = 0; slot < valueCount; ++slot)
+    {
+        pValue4[slot].x = randomValue;
+        pValue4[slot].y = randomValue;
+        pValue4[slot].z = randomValue;
+        pValue4[slot].w = randomValue;
+        pValue2[slot].x = randomValue;
+        pValue2[slot].y = randomValue;
+    }
+
+    gpu->psRandomSeed->Upload();
+    gpu->psRandom4->Upload();
+    gpu->psRandom2->Upload();
+    gpu->psRandomPosition->Upload();
+
+    // Allocate and clear linear momentum buffer (one float4 per block)
+    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1);
+    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
+    float4* pMomentum = gpu->psLinearMomentum->_pSysStream[0];
+    for (int block = 0; block < (int) gpu->sim.blocks; ++block)
+    {
+        pMomentum[block].x = 0.0f;
+        pMomentum[block].y = 0.0f;
+        pMomentum[block].z = 0.0f;
+        pMomentum[block].w = 0.0f;
+    }
+    gpu->psLinearMomentum->Upload();
+
+    return 1;
+}
+
+// Reads atom count, positions/charges and velocities/inverse masses from the
+// text file fname into gpu, (re)allocating the initial buffers for that atom
+// count and uploading the result.
+//
+// Input consumed: a first line whose leading two tokens are a label and the
+// atom count (the rest of that line is skipped), then one record per atom:
+//     <ignored int> <pos.x> <pos.y> <pos.z> <pos.w> <vel.x> <vel.y> <vel.z> <vel.w>
+// vel.w is treated as an inverse mass: its reciprocal is summed to obtain
+// gpu->sim.inverseTotalMass.
+//
+// If the file cannot be opened or parsed, a message is written to cout and
+// the process exits with status -1.
+//
+// NOTE(review): gpuAllocateInitialBuffers() assigns fresh streams without
+// releasing earlier ones, so calling this on a context that already went
+// through gpuInit() appears to leak the first set - confirm intended usage.
+extern "C"
+void gpuReadCoordinates(gpuContext gpu, char* fname)
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening coordinate file " << fname << endl;
+        exit(-1);
+    }
+
+    gpu->natoms = 0;
+    char buff[512];
+
+    // Bound the label read; operator>> into a char array is otherwise unlimited.
+    infile.width(sizeof(buff));
+    infile >> buff >> gpu->natoms;
+    if (infile.fail() || gpu->natoms <= 0)
+    {
+        // Buffers are sized from natoms, so refuse to continue without a valid count.
+        cout << "Error reading coordinate file " << fname << endl;
+        exit(-1);
+    }
+    infile.getline(buff, 511);
+    float totalMass = 0.0f;
+
+    gpuAllocateInitialBuffers(gpu);
+    
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        int junk;
+        infile >> junk >> 
+            gpu->psPosq4->_pSysStream[0][i].x >> 
+            gpu->psPosq4->_pSysStream[0][i].y >> 
+            gpu->psPosq4->_pSysStream[0][i].z >>
+            gpu->psPosq4->_pSysStream[0][i].w >>
+            gpu->psVelm4->_pSysStream[0][i].x >> 
+            gpu->psVelm4->_pSysStream[0][i].y >> 
+            gpu->psVelm4->_pSysStream[0][i].z >>
+            gpu->psVelm4->_pSysStream[0][i].w;
+        if (infile.fail())
+        {
+            cout << "Error reading coordinate file " << fname << endl;
+            exit(-1);
+        }
+        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
+        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
+        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
+        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;
+
+        // Accumulate mass
+        totalMass += 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
+    }
+    
+    gpu->sim.inverseTotalMass = 1.0f / totalMass;
+    gpu->psPosq4->Upload();
+    gpu->psVelm4->Upload();
+    gpu->psxVector4->Upload();
+}
+
+// Copies the first gpu->natoms entries of x, y and z into the position
+// components of gpu->psPosq4 (the w component is left untouched), uploads the
+// stream and flags the Born radii for recalculation.
+extern "C"
+void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
+{
+    float4* pPosq = gpu->psPosq4->_pSysStream[0];
+    for (int atom = 0; atom < gpu->natoms; ++atom)
+    {
+        pPosq[atom].x = x[atom];
+        pPosq[atom].y = y[atom];
+        pPosq[atom].z = z[atom];
+    }
+    gpu->psPosq4->Upload();
+
+    // New positions invalidate the Born radii.
+    gpu->bRecalculateBornRadii = true;
+}
+
+// Copies the first gpu->natoms entries of x, y and z into the velocity
+// components of gpu->psVelm4 (the w component is left untouched) and uploads
+// the stream.
+extern "C"
+void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
+{
+    float4* pVelm = gpu->psVelm4->_pSysStream[0];
+    for (int atom = 0; atom < gpu->natoms; ++atom)
+    {
+        pVelm[atom].x = x[atom];
+        pVelm[atom].y = y[atom];
+        pVelm[atom].z = z[atom];
+    }
+    gpu->psVelm4->Upload();
+}
+
+// Stores 1/mass[i] in the w component of gpu->psVelm4 for the first
+// gpu->natoms atoms, sets gpu->sim.inverseTotalMass to the reciprocal of the
+// summed masses, and uploads the stream.
+extern "C"
+void gpuSetMass(gpuContext gpu, const vector<float>& mass)
+{
+    float4* pVelm = gpu->psVelm4->_pSysStream[0];
+    float massSum = 0.0f;
+    for (int atom = 0; atom < gpu->natoms; ++atom)
+    {
+        pVelm[atom].w = 1.0f/mass[atom];
+        massSum += mass[atom];
+    }
+    gpu->sim.inverseTotalMass = 1.0f / massSum;
+    gpu->psVelm4->Upload();
+}
+
+// Resets the per-block random read positions to zero, reseeds the per-thread
+// generator seeds from gpu->seed via srand()/rand(), uploads both streams,
+// then calls gpuSetConstants() and kGenerateRandoms().
+extern "C"
+void gpuInitializeRandoms(gpuContext gpu)
+{
+    // One read position per block
+    const int blockCount = (int) gpu->sim.blocks;
+    int* pPosition = gpu->psRandomPosition->_pSysStream[0];
+    for (int block = 0; block < blockCount; ++block)
+        pPosition[block] = 0;
+
+    // One uint4 seed per generator thread, filled in x, y, z, w order
+    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
+    srand(seed);
+    const int seedCount = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
+    uint4* pSeed = gpu->psRandomSeed->_pSysStream[0];
+    for (int thread = 0; thread < seedCount; ++thread)
+    {
+        pSeed[thread].x = rand();
+        pSeed[thread].y = rand();
+        pSeed[thread].z = rand();
+        pSeed[thread].w = rand();
+    }
+
+    gpu->psRandomPosition->Upload();
+    gpu->psRandomSeed->Upload();
+    gpuSetConstants(gpu);
+    kGenerateRandoms(gpu);
+}
+
+// Returns true when the CUDA runtime reports at least one device.
+// A failing cudaGetDeviceCount() call is treated as "no device": the count is
+// initialised to zero and the returned status is checked, so the result never
+// depends on an uninitialised value.
+extern "C"
+bool gpuIsAvailable()
+{
+    int deviceCount = 0;
+    cudaError_t status = cudaGetDeviceCount(&deviceCount);
+    return (status == cudaSuccess && deviceCount > 0);
+}
+
+// Creates a gpuContext from the coordinate file fname and returns it as void*.
+//
+// Input consumed: a first line whose leading two tokens are a label and the
+// atom count (the rest of that line is skipped), then one record per atom:
+//     <ignored int> <x> <y> <z> <charge> <vx> <vy> <vz> <m>
+// The last field is inverted before being handed to gpuSetMass().  The charge
+// field is read to stay aligned with the record layout but is not used here.
+//
+// The whole file is parsed before gpuInit() is called.  If the file cannot be
+// opened or parsed, a message is written to cout and the process exits with
+// status -1.
+extern "C"
+void* gpuInitFromFile(char* fname) 
+{
+    ifstream infile(fname);
+    if (infile.fail())
+    {
+        cout << "Error opening coordinate file " << fname << endl;
+        exit(-1);
+    }
+
+    int numAtoms = 0;
+    char buff[512];
+
+    // Bound the label read; operator>> into a char array is otherwise unlimited.
+    infile.width(sizeof(buff));
+    infile >> buff >> numAtoms;
+    if (infile.fail() || numAtoms <= 0)
+    {
+        cout << "Error reading coordinate file " << fname << endl;
+        exit(-1);
+    }
+    infile.getline(buff, 511);
+
+    vector<float> x(numAtoms), y(numAtoms), z(numAtoms), charge(numAtoms), vx(numAtoms), vy(numAtoms), vz(numAtoms), mass(numAtoms);
+    for (int i = 0; i < numAtoms; i++)
+    {
+        int junk;
+        infile >> junk >> 
+            x[i] >> 
+            y[i] >> 
+            z[i] >> 
+            charge[i] >> 
+            vx[i] >> 
+            vy[i] >> 
+            vz[i] >> 
+            mass[i];
+        if (infile.fail())
+        {
+            cout << "Error reading coordinate file " << fname << endl;
+            exit(-1);
+        }
+        mass[i] = 1.0f/mass[i];
+    }
+
+    gpuContext gpu = (gpuContext) gpuInit(numAtoms);
+    gpuSetPositions(gpu, x, y, z);
+    gpuSetVelocities(gpu, vx, vy, vz);
+    gpuSetMass(gpu, mass);
+    return (void*)gpu;
+}
+
+// Creates and initialises a gpuContext for numAtoms atoms and returns it as
+// void*.
+//
+// Steps, in order:
+//   1. select the CUDA device (index taken from the NV_FAH_DEVICE environment
+//      variable when set, otherwise 0);
+//   2. query that device and choose the SM version and per-kernel thread
+//      counts from its properties;
+//   3. allocate the initial buffers and clear the x-vector stream;
+//   4. derive the update / reduction thread counts from the atom count;
+//   5. fill gpu->sim with default constants and integration parameters and
+//      initialise the random streams;
+//   6. null the stream pointers that are filled in later and allocate the
+//      host-side output-buffer counters and exclusion array.
+//
+// Differences from the previous revision: the properties of the *selected*
+// device are queried (it used to be device 0 regardless), the query status is
+// checked, devices that are not compute capability 1.0/1.1 always get the
+// SM_12 settings (they used to be left unset when major != 1), and the
+// exclusion array size is computed in size_t.
+extern "C"
+void* gpuInit(int numAtoms)
+{
+    gpuContext gpu = new _gpuContext;
+    int LRFSize = 0;
+    int SMCount = 0;
+    int SMMajor = 0;
+    int SMMinor = 0;
+
+    // Get adapter.  The index is kept as int: that is what "%d" writes and
+    // what the CUDA runtime calls below take.
+    int device = 0;
+    char * pAdapter;
+    pAdapter = getenv ("NV_FAH_DEVICE");
+    if (pAdapter != NULL)
+    {
+        sscanf(pAdapter, "%d", &device);
+    }
+    cudaError_t status = cudaSetDevice(device);
+    RTERROR(status, "Error setting CUDA device")
+
+    // Determine which core to run on
+#if 0
+    SYSTEM_INFO info;
+    GetSystemInfo(&info);
+    unsigned int cores = info.dwNumberOfProcessors;
+    if (cores > 1)
+    {
+        HANDLE hproc = GetCurrentProcess();
+        unsigned int core = (cores - 1) - (device % (cores - 1)); 
+        unsigned int mask = 1 << core;
+        SetProcessAffinityMask(hproc, mask);
+    }
+#endif
+
+    // Determine kernel call configuration from the device that was selected above
+    cudaDeviceProp deviceProp;
+    status = cudaGetDeviceProperties(&deviceProp, device);
+    RTERROR(status, "Error querying CUDA device properties")
+
+    // Determine SM version: 1.0 and 1.1 use the G8X settings, everything
+    // else uses the GT2XX settings.
+    if (deviceProp.major == 1 && deviceProp.minor < 2)
+    {
+        gpu->sm_version = SM_10;
+        gpu->sim.workUnitsPerSM = G8X_NONBOND_WORKUNITS_PER_SM;
+    }
+    else
+    {
+        gpu->sm_version = SM_12;
+        gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
+    }
+
+    // One block per multiprocessor to start with
+    gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount;
+    gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount;
+    gpu->sim.blocks = deviceProp.multiProcessorCount;
+
+    // Thread counts are keyed on the register file size
+    if (deviceProp.regsPerBlock == 8192)
+    {
+        gpu->sim.nonbond_threads_per_block          = G8X_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.bornForce2_threads_per_block       = G8X_BORNFORCE2_THREADS_PER_BLOCK;
+        gpu->sim.max_shake_threads_per_block        = G8X_SHAKE_THREADS_PER_BLOCK;
+        gpu->sim.max_update_threads_per_block       = G8X_UPDATE_THREADS_PER_BLOCK;
+        gpu->sim.max_localForces_threads_per_block  = G8X_LOCALFORCES_THREADS_PER_BLOCK;
+        gpu->sim.threads_per_block                  = G8X_THREADS_PER_BLOCK;
+        gpu->sim.random_threads_per_block           = G8X_RANDOM_THREADS_PER_BLOCK;
+    }
+    else
+    {
+        gpu->sim.nonbond_threads_per_block          = GT2XX_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.bornForce2_threads_per_block       = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
+        gpu->sim.max_shake_threads_per_block        = GT2XX_SHAKE_THREADS_PER_BLOCK;
+        gpu->sim.max_update_threads_per_block       = GT2XX_UPDATE_THREADS_PER_BLOCK;
+        gpu->sim.max_localForces_threads_per_block  = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
+        gpu->sim.threads_per_block                  = GT2XX_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.random_threads_per_block           = GT2XX_RANDOM_THREADS_PER_BLOCK;
+    }
+    gpu->sim.shake_threads_per_block                = gpu->sim.max_shake_threads_per_block;
+    gpu->sim.localForces_threads_per_block          = gpu->sim.max_localForces_threads_per_block;
+
+    gpu->natoms = numAtoms;
+    gpuAllocateInitialBuffers(gpu);
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
+        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
+        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
+        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;
+    }
+    gpu->psxVector4->Upload();
+
+    // Update / reduction thread counts: atoms spread over the blocks,
+    // clamped to [1, maximum].
+    gpu->iterations = 0;
+    gpu->sim.update_threads_per_block               = (gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.update_threads_per_block > gpu->sim.max_update_threads_per_block)
+        gpu->sim.update_threads_per_block = gpu->sim.max_update_threads_per_block;
+    if (gpu->sim.update_threads_per_block < 1)
+            gpu->sim.update_threads_per_block = 1;
+    gpu->sim.bf_reduce_threads_per_block = gpu->sim.update_threads_per_block;
+    gpu->sim.bsf_reduce_threads_per_block = (gpu->sim.stride4 + gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    // Round up to a multiple of GRID before clamping
+    gpu->sim.bsf_reduce_threads_per_block = ((gpu->sim.bsf_reduce_threads_per_block + (GRID - 1)) / GRID) * GRID;
+    if (gpu->sim.bsf_reduce_threads_per_block > gpu->sim.threads_per_block)
+        gpu->sim.bsf_reduce_threads_per_block = gpu->sim.threads_per_block;
+    if (gpu->sim.bsf_reduce_threads_per_block < 1)
+        gpu->sim.bsf_reduce_threads_per_block = 1;
+
+    // Initialize constants to reasonable values
+    gpu->sim.probeRadius            = probeRadius;
+    gpu->sim.surfaceAreaFactor      = surfaceAreaFactor;
+    gpu->sim.electricConstant       = electricConstant;
+
+    gpu->sim.bigFloat               = 99999999.0f;
+    gpu->sim.forceConversionFactor  = forceConversionFactor;
+    gpu->sim.preFactor              = 2.0f*electricConstant*((1.0f/defaultInnerDielectric)-(1.0f/defaultSolventDielectric))*gpu->sim.forceConversionFactor;
+    gpu->sim.dielectricOffset       = dielectricOffset;
+    gpu->sim.alphaOBC               = alphaOBC;
+    gpu->sim.betaOBC                = betaOBC;
+    gpu->sim.gammaOBC               = gammaOBC;
+    gpuSetIntegrationParameters(gpu, 1.0f, 2.0e-3f, 300.0f);
+    gpu->sim.maxShakeIterations     = 15;
+    gpu->sim.shakeTolerance         = 1.0e-04f * 2.0f;
+    gpu->sim.InvMassJ               = 9.920635e-001f;
+    gpu->grid                       = GRID;
+    gpu->bCalculateCM               = false;
+    gpu->bRemoveCM                  = false;
+    gpu->bRecalculateBornRadii      = true;
+    gpuInitializeRandoms(gpu);
+
+    // To be determined later
+    gpu->psForce4                   = NULL;
+    gpu->sim.pForce4                = NULL;
+    gpu->sim.pForce4a               = NULL;
+    gpu->sim.pForce4b               = NULL;
+    gpu->psBornForce                = NULL;
+    gpu->sim.pBornForce             = NULL;
+    gpu->psBornSum                  = NULL;
+    gpu->sim.pBornSum               = NULL;
+    gpu->psBondID                   = NULL;
+    gpu->psBondParameter            = NULL;
+    gpu->psBondAngleID1             = NULL;
+    gpu->psBondAngleID2             = NULL;
+    gpu->psBondAngleParameter       = NULL;
+    gpu->psDihedralID1              = NULL;
+    gpu->psDihedralID2              = NULL;
+    gpu->psDihedralParameter        = NULL;
+    gpu->psRbDihedralID1            = NULL;
+    gpu->psRbDihedralID2            = NULL;
+    gpu->psRbDihedralParameter1     = NULL;
+    gpu->psRbDihedralParameter2     = NULL;
+    gpu->psLJ14ID                   = NULL;
+    gpu->psLJ14Parameter            = NULL;
+    gpu->psShakeID                  = NULL;
+    gpu->psShakeParameter           = NULL;
+    gpu->psExclusion                = NULL;
+    gpu->psWorkUnit                 = NULL;
+
+
+    // Initialize output buffer before reading parameters
+    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
+    memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
+    
+    // Initialize exclusion array: padded x padded entries, all set to 1.
+    // The element count is formed in size_t so the product does not wrap
+    // in 32-bit arithmetic for large systems.
+    const size_t exclusionEntries   = (size_t) gpu->sim.paddedNumberOfAtoms * (size_t) gpu->sim.paddedNumberOfAtoms;
+    gpu->pExclusion = new unsigned int[exclusionEntries];
+    for (size_t i = 0; i < exclusionEntries; i++)
+        gpu->pExclusion[i] = 1;
+
+    return (void*)gpu;
+}
+
+// Fills gpu->sim with the integration constants derived from the coupling
+// time tau, the step size deltaT and the temperature.
+//
+// GDT = deltaT / tau drives everything else: four exponentials of GDT, the
+// B, C and D coefficients (closed form when GDT >= 0.1, a polynomial in
+// GDT / 2 otherwise) and the scale factors V, X, Yv and Yx built from kT.
+extern "C"
+void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
+    // Step size, coupling time and their ratio
+    gpu->sim.deltaT                 = deltaT;
+    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
+    gpu->sim.tau                    = tau;
+    gpu->sim.GDT                    = gpu->sim.deltaT / gpu->sim.tau;
+
+    // Exponentials of +/- GDT and +/- GDT / 2
+    gpu->sim.EPH                    = exp(0.5f * gpu->sim.GDT);
+    gpu->sim.EMH                    = exp(-0.5f * gpu->sim.GDT);
+    gpu->sim.EP                     = exp(gpu->sim.GDT);
+    gpu->sim.EM                     = exp(-gpu->sim.GDT);
+    gpu->sim.OneMinusEM             = 1.0f - gpu->sim.EM;
+    gpu->sim.TauOneMinusEM          = gpu->sim.tau * gpu->sim.OneMinusEM;
+
+    // B, C, D coefficients
+    if (!(gpu->sim.GDT >= 0.1f))
+    {
+        // Polynomial form in h = GDT / 2
+        const float h               = 0.5f * gpu->sim.GDT;
+        const float h2              = h * h;
+        const float h4              = h2 * h2;
+
+        const float c1_3            = 1.0f / 3.0f;
+        const float c7_9            = 7.0f / 9.0f;
+        const float c1_12           = 1.0f / 12.0f;
+        const float c17_90          = 17.0f / 90.0f;
+        const float c7_30           = 7.0f / 30.0f;
+        const float c31_1260        = 31.0f / 1260.0f;
+        const float c1_360          = 1.0f / 360.0f;
+
+        gpu->sim.B                  = h4 * (c1_3 + h * (c1_3 + h * (c17_90 + h * c7_9)));
+        gpu->sim.C                  = h2 * h * (2.0f * c1_3 + h * (-0.5f + h * (c7_30 + h * (-c1_12 + h * c31_1260))));
+        gpu->sim.D                  = h2 * (-1.0f + h2 * (-c1_12 - h2 * c1_360));
+    }
+    else
+    {
+        // Closed form built from the exponentials
+        float ephMinusOneSq         = gpu->sim.EPH - 1.0f;
+        ephMinusOneSq              *= ephMinusOneSq;
+        gpu->sim.B                  = gpu->sim.GDT * (gpu->sim.EP - 1.0f) - 4.0f * ephMinusOneSq;
+        gpu->sim.C                  = gpu->sim.GDT - 3.0f + 4.0f * gpu->sim.EMH - gpu->sim.EM;
+        gpu->sim.D                  = 2.0f - gpu->sim.EPH - gpu->sim.EMH;
+    }
+
+    // Combinations of tau with the coefficients above
+    gpu->sim.TauDOverEMMinusOne     = gpu->sim.tau * gpu->sim.D / (gpu->sim.EM - 1.0f);
+    gpu->sim.DOverTauC              = gpu->sim.D / (gpu->sim.tau * gpu->sim.C);
+    gpu->sim.fix1                   = gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH);
+    gpu->sim.oneOverFix1            = 1.0f / (gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH));
+
+    // Temperature-dependent scale factors
+    gpu->sim.T                      = temperature;
+    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
+    gpu->sim.V                      = sqrt(gpu->sim.kT * (1.0f - gpu->sim.EM));
+    gpu->sim.X                      = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.C);
+    gpu->sim.Yv                     = sqrt(gpu->sim.kT * gpu->sim.B / gpu->sim.C);
+    gpu->sim.Yx                     = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.B / (1.0f - gpu->sim.EM));
+}
+
+// Stores the step size and its reciprocal in gpu->sim; nothing else is touched.
+extern "C"
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT) {
+    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
+    gpu->sim.deltaT = deltaT;
+}
+
+// Stores the step size, tau and temperature in gpu->sim and derives from them
+// GDT (here the product deltaT * tau), kT, and a common value
+// sqrt(2 * kT * deltaT * tau) for both Yx and Yv.
+extern "C"
+void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
+    // Step size and coupling parameter
+    gpu->sim.deltaT = deltaT;
+    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
+    gpu->sim.tau = tau;
+    gpu->sim.GDT = gpu->sim.deltaT * gpu->sim.tau;
+
+    // Temperature and the shared scale factor
+    gpu->sim.T = temperature;
+    gpu->sim.kT = BOLTZ * gpu->sim.T;
+    gpu->sim.Yx = sqrt(2.0f*gpu->sim.kT*deltaT*tau);
+    gpu->sim.Yv = gpu->sim.Yx;
+}
+
+// Stores the temperature, kT and collision probability in gpu->sim and resets
+// the Yx, Yv, X and V scale factors to 1.
+extern "C"
+void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability) {
+    gpu->sim.T = temperature;
+    gpu->sim.kT = BOLTZ * gpu->sim.T;
+    gpu->sim.collisionProbability = collisionProbability;
+
+    // Same assignment order as a chained "a = b = 1.0f": right-hand field first.
+    gpu->sim.Yx = 1.0f;
+    gpu->sim.Yv = gpu->sim.Yx;
+    gpu->sim.X = 1.0f;
+    gpu->sim.V = gpu->sim.X;
+}
+
+// Releases everything owned by a gpuContext: the host-side arrays, every
+// CUDAStream it points to, and finally the context itself.  Passing NULL is
+// a no-op, so callers may shut down unconditionally.
+//
+// Streams that were never created are expected to be NULL (gpuInit() nulls
+// the ones that are filled in later), which makes their delete harmless.
+//
+// NOTE(review): gpu->psNonShakeID (allocated in gpuSetShakeParameters() when
+// DeltaShake is defined) is not released here, and gpuInit() does not null
+// it, so it cannot be deleted safely from this function alone - confirm.
+extern "C"
+void gpuShutDown(gpuContext gpu)
+{
+    if (gpu == NULL)
+        return;
+
+    // Delete sysmem pointers
+    delete[] gpu->pOutputBufferCounter;
+    delete[] gpu->pExclusion;
+    delete[] gpu->gpAtomTable;
+    delete[] gpu->pAtomSymbol;
+
+    // Delete device pointers
+    delete gpu->psPosq4;
+    delete gpu->psPosqP4;
+    delete gpu->psOldPosq4;
+    delete gpu->psVelm4;
+    delete gpu->psForce4;
+    delete gpu->psxVector4;
+    delete gpu->psvVector4;
+    delete gpu->psSigEps2; 
+    delete gpu->psObcData; 
+    delete gpu->psObcChain;
+    delete gpu->psBornForce;
+    delete gpu->psBornRadii;
+    delete gpu->psBornSum;
+    delete gpu->psBondID;
+    delete gpu->psBondParameter;
+    delete gpu->psBondAngleID1;
+    delete gpu->psBondAngleID2;
+    delete gpu->psBondAngleParameter;
+    delete gpu->psDihedralID1;
+    delete gpu->psDihedralID2;
+    delete gpu->psDihedralParameter;
+    delete gpu->psRbDihedralID1;
+    delete gpu->psRbDihedralID2;
+    delete gpu->psRbDihedralParameter1;
+    delete gpu->psRbDihedralParameter2;
+    delete gpu->psLJ14ID;
+    delete gpu->psLJ14Parameter;
+    delete gpu->psShakeID;
+    delete gpu->psShakeParameter;
+    delete gpu->psExclusion;
+    delete gpu->psWorkUnit;
+    delete gpu->psRandom4;
+    delete gpu->psRandom2;
+    delete gpu->psRandomPosition;    
+    delete gpu->psRandomSeed;
+    delete gpu->psLinearMomentum;
+
+    // Wrap up
+    delete gpu;
+    return;
+}
+
+// Sizes and allocates the force / Born output streams, computes the running
+// parameter offsets for the bonded interaction types, and rewrites the
+// output-buffer indices stored in the bonded ID streams as
+// (bufferCount - 1 - index) before uploading them.
+//
+// The number of force output buffers is the larger of
+// gpu->sim.totalNonbondOutputBuffers and the largest entry of
+// gpu->pOutputBufferCounter.  Always returns 1.
+extern "C"
+int gpuBuildOutputBuffers(gpuContext gpu)
+{
+    // Largest per-atom counter, but never fewer than the nonbond buffers
+    unsigned int bufferCount = gpu->sim.totalNonbondOutputBuffers;
+    for (unsigned int atom = 0; atom < gpu->sim.paddedNumberOfAtoms; atom++)
+    {
+        const unsigned int needed = gpu->pOutputBufferCounter[atom];
+        if (bufferCount < needed)
+            bufferCount = needed;
+    }
+    gpu->sim.outputBuffers = bufferCount;
+
+    // Output streams and their device pointers
+    gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, bufferCount);
+    gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
+    gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
+    gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
+    gpu->sim.pForce4a = gpu->sim.pForce4;
+    gpu->sim.pForce4b = gpu->sim.pForce4 + gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
+    gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
+    gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];
+
+    // Determine local energy parameter offsets for bonded interactions:
+    // each offset is the previous one plus the stride of the next parameter stream.
+    gpu->sim.bond_offset = gpu->psBondParameter->_stride;
+    gpu->sim.bond_angle_offset = gpu->sim.bond_offset + gpu->psBondAngleParameter->_stride;
+    gpu->sim.dihedral_offset = gpu->sim.bond_angle_offset + gpu->psDihedralParameter->_stride;
+    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset + gpu->psRbDihedralParameter1->_stride;
+    gpu->sim.LJ14_offset = gpu->sim.rb_dihedral_offset + gpu->psLJ14Parameter->_stride;
+
+    // Threads per block for the local forces: per-block share rounded up to a
+    // multiple of 16, then clamped to [1, max_localForces_threads_per_block].
+    gpu->sim.localForces_threads_per_block = (gpu->sim.LJ14_offset / gpu->sim.blocks + 15) & 0xfffffff0;
+    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
+        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
+    if (gpu->sim.localForces_threads_per_block < 1)
+        gpu->sim.localForces_threads_per_block = 1;
+
+    // Flip local force output buffers: index -> lastBuffer - index
+    const int lastBuffer = bufferCount - 1;
+
+    const int bondCount = (int) gpu->sim.bonds;
+    for (int n = 0; n < bondCount; n++)
+    {
+        gpu->psBondID->_pSysStream[0][n].z = lastBuffer - gpu->psBondID->_pSysStream[0][n].z;
+        gpu->psBondID->_pSysStream[0][n].w = lastBuffer - gpu->psBondID->_pSysStream[0][n].w;
+    }
+
+    const int angleCount = (int) gpu->sim.bond_angles;
+    for (int n = 0; n < angleCount; n++)
+    {
+        gpu->psBondAngleID1->_pSysStream[0][n].w = lastBuffer - gpu->psBondAngleID1->_pSysStream[0][n].w;
+        gpu->psBondAngleID2->_pSysStream[0][n].x = lastBuffer - gpu->psBondAngleID2->_pSysStream[0][n].x;
+        gpu->psBondAngleID2->_pSysStream[0][n].y = lastBuffer - gpu->psBondAngleID2->_pSysStream[0][n].y;
+    }
+
+    const int dihedralCount = (int) gpu->sim.dihedrals;
+    for (int n = 0; n < dihedralCount; n++)
+    {
+        gpu->psDihedralID2->_pSysStream[0][n].x = lastBuffer - gpu->psDihedralID2->_pSysStream[0][n].x;
+        gpu->psDihedralID2->_pSysStream[0][n].y = lastBuffer - gpu->psDihedralID2->_pSysStream[0][n].y;
+        gpu->psDihedralID2->_pSysStream[0][n].z = lastBuffer - gpu->psDihedralID2->_pSysStream[0][n].z;
+        gpu->psDihedralID2->_pSysStream[0][n].w = lastBuffer - gpu->psDihedralID2->_pSysStream[0][n].w;
+    }
+
+    const int rbDihedralCount = (int) gpu->sim.rb_dihedrals;
+    for (int n = 0; n < rbDihedralCount; n++)
+    {
+        gpu->psRbDihedralID2->_pSysStream[0][n].x = lastBuffer - gpu->psRbDihedralID2->_pSysStream[0][n].x;
+        gpu->psRbDihedralID2->_pSysStream[0][n].y = lastBuffer - gpu->psRbDihedralID2->_pSysStream[0][n].y;
+        gpu->psRbDihedralID2->_pSysStream[0][n].z = lastBuffer - gpu->psRbDihedralID2->_pSysStream[0][n].z;
+        gpu->psRbDihedralID2->_pSysStream[0][n].w = lastBuffer - gpu->psRbDihedralID2->_pSysStream[0][n].w;
+    }
+
+    const int lj14Count = (int) gpu->sim.LJ14s;
+    for (int n = 0; n < lj14Count; n++)
+    {
+        gpu->psLJ14ID->_pSysStream[0][n].z = lastBuffer - gpu->psLJ14ID->_pSysStream[0][n].z;
+        gpu->psLJ14ID->_pSysStream[0][n].w = lastBuffer - gpu->psLJ14ID->_pSysStream[0][n].w;
+    }
+
+    // Push the rewritten ID streams to the device
+    gpu->psBondID->Upload();
+    gpu->psBondAngleID1->Upload();
+    gpu->psBondAngleID2->Upload();
+    gpu->psDihedralID2->Upload();
+    gpu->psRbDihedralID2->Upload();
+    gpu->psLJ14ID->Upload();
+
+    return 1;
+}
+
+// Builds the tile work-unit list consumed by the pairwise kernels and uploads
+// it to the device.
+//
+// The padded atom range is cut into dim = ceil(atoms / grid) tiles per axis and
+// one work unit is emitted for every tile pair (x, y) with x >= y, giving
+// dim * (dim + 1) / 2 units. A unit is packed as (x << 17) | (y << 2), and bit 0
+// is set when at least one gpu->pExclusion entry inside the tile is zero.
+//
+// Side effects: allocates gpu->psWorkUnit, publishes its device pointer and the
+// unit count in gpu->sim, may raise the nonbond / bornForce2 block counts, may
+// lower their threads-per-block, and finishes with gpuSetConstants().
+//
+// NOTE(review): the tile scan reads grid x grid entries per tile, so it assumes
+// paddedNumberOfAtoms is a multiple of gpu->grid -- confirm against the caller.
+//
+// Returns the number of work units.
+extern "C"
+int gpuBuildThreadBlockWorkList(gpuContext gpu)
+{
+    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
+    const unsigned int grid = gpu->grid;
+    const unsigned int dim = (atoms + (grid - 1)) / grid;
+    const unsigned int cells = dim * (dim + 1) / 2;
+    const unsigned int* pExclusion = gpu->pExclusion;
+    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
+    unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
+    gpu->psWorkUnit = psWorkUnit;
+    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
+    gpu->sim.nonbond_workBlock      = gpu->sim.nonbond_threads_per_block / GRID;
+    gpu->sim.bornForce2_workBlock   = gpu->sim.bornForce2_threads_per_block / GRID;
+    gpu->sim.workUnits = cells;
+
+    // Increase block count if necessary for extra large molecules that would
+    // otherwise overflow the SM workunit buffers. The count is rounded up to a
+    // multiple of its current value.
+    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
+    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
+    {
+        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
+    }
+    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
+    {
+        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
+    }
+
+    // Whole work units per block, plus the leftover that does not divide evenly
+    gpu->sim.nbWorkUnitsPerBlock            = cells / gpu->sim.nonbond_blocks;
+    gpu->sim.nbWorkUnitsPerBlockRemainder   = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
+    gpu->sim.bf2WorkUnitsPerBlock           = cells / gpu->sim.bornForce2_blocks;
+    gpu->sim.bf2WorkUnitsPerBlockRemainder  = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;
+
+    // Decrease thread count for extra small molecules to spread computation
+    // across entire chip
+    int activeWorkUnits = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
+    if (activeWorkUnits > (int) cells)
+    {
+        int balancedWorkBlock                   = (cells + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
+        gpu->sim.nonbond_threads_per_block      = balancedWorkBlock * GRID;
+        gpu->sim.nonbond_workBlock              = balancedWorkBlock;
+    }
+    activeWorkUnits = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
+    if (activeWorkUnits > (int) cells)
+    {
+        int balancedWorkBlock                   = (cells + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
+        gpu->sim.bornForce2_threads_per_block   = balancedWorkBlock * GRID;
+        gpu->sim.bornForce2_workBlock           = balancedWorkBlock;
+    }
+
+    unsigned int count = 0;
+    for (unsigned int y = 0; y < dim; y++)
+    {
+        const unsigned int rowBegin = y * grid;
+        const unsigned int rowEnd   = rowBegin + grid;
+        for (unsigned int x = y; x < dim; x++)
+        {
+            const unsigned int colBegin = x * grid;
+            const unsigned int colEnd   = colBegin + grid;
+            unsigned int workUnit       = (x << 17) | (y << 2);
+
+            // Only the presence of an exclusion matters, so stop scanning the
+            // tile as soon as the first zero entry is seen.
+            bool hasExclusion = false;
+            for (unsigned int i = rowBegin; i < rowEnd && !hasExclusion; i++)
+            {
+                const unsigned int* pRow = pExclusion + i * atoms;
+                for (unsigned int j = colBegin; j < colEnd; j++)
+                {
+                    if (!pRow[j])
+                    {
+                        hasExclusion = true;
+                        break;
+                    }
+                }
+            }
+
+            // Signal exclusions if they exist
+            if (hasExclusion)
+                workUnit |= 0x1;
+            pWorkList[count++] = workUnit;
+        }
+    }
+
+    psWorkUnit->Upload();
+    gpuSetConstants(gpu);
+    return cells;
+}
+
+// Packs the atoms x atoms table in gpu->pExclusion into one 32-bit mask per
+// (row, tile column) and uploads the result.
+//
+// Tiles are visited x-major, then y; inside a tile one mask is written per row.
+// For each column of the tile the mask is shifted right by one and its top bit
+// is set when the table entry is non-zero, so with a 32-wide tile bit k ends up
+// describing column (tile start + k).
+//
+// Side effects: allocates gpu->psExclusion, stores its device pointer in
+// gpu->sim.pExclusion and calls gpuSetConstants().
+//
+// Returns the number of zero table entries whose row index is >= the column
+// index.
+extern "C"
+int gpuBuildExclusionList(gpuContext gpu)
+{
+    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
+    const unsigned int tile  = gpu->grid;
+    const unsigned int* pTable = gpu->pExclusion;
+
+    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
+    gpu->psExclusion    = psExclusion;
+    gpu->sim.pExclusion = psExclusion->_pDevStream[0];
+
+    unsigned int* pOut = psExclusion->_pSysStream[0];
+    int excludedPairs  = 0;
+
+    for (unsigned int tileRow = 0; tileRow < atoms; tileRow += tile)
+    {
+        for (unsigned int tileCol = 0; tileCol < atoms; tileCol += tile)
+        {
+            for (unsigned int row = tileRow; row < tileRow + tile; row++)
+            {
+                unsigned int bits = 0;
+                for (unsigned int col = tileCol; col < tileCol + tile; col++)
+                {
+                    bits >>= 1;
+                    if (pTable[row * atoms + col] != 0)
+                    {
+                        bits |= 0x80000000;
+                    }
+                    else if (row >= col)
+                    {
+                        excludedPairs++;
+                    }
+                }
+                *pOut++ = bits;
+            }
+        }
+    }
+
+    psExclusion->Upload();
+    gpuSetConstants(gpu);
+    return excludedPairs;
+}
+
+// Hands the context to every kernel module's Set...Sim() routine.
+// NOTE(review): presumably each routine copies gpu->sim into that module's
+// device-side constants -- confirm in the individual kernel files.
+//
+// The *_12 variants are only invoked when gpu->sm_version is at least SM_12.
+// Always returns 1.
+extern "C"
+int gpuSetConstants(gpuContext gpu)
+{
+    // Modules that are configured on every device
+    SetCalculateCDLJForcesSim(gpu);
+    SetCalculateCDLJObcGbsaForces1Sim(gpu);
+    SetCalculateLocalForcesSim(gpu);
+    SetCalculateObcGbsaBornSumSim(gpu);
+    SetCalculateObcGbsaForces1Sim(gpu);
+    SetCalculateObcGbsaForces2Sim(gpu);
+    SetCalculateAndersenThermostatSim(gpu);
+    SetForcesSim(gpu);
+    SetUpdateShakeHSim(gpu);
+    SetVerletUpdateSim(gpu);
+    SetBrownianUpdateSim(gpu);
+    SetRandomSim(gpu);
+
+    // Nothing more to do below SM_12
+    if (gpu->sm_version < SM_12)
+        return 1;
+
+    SetCalculateCDLJForces_12Sim(gpu);
+    SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
+    SetCalculateObcGbsaForces1_12Sim(gpu);
+    SetCalculateObcGbsaForces2_12Sim(gpu);
+    return 1;
+}
+
+// Downloads psPosq4 and psVelm4 and prints, for each of the gpu->natoms atoms,
+// its index followed by the four components of each entry (x, y, z, w).
+extern "C"
+void gpuDumpCoordinates(gpuContext gpu)
+{
+    gpu->psPosq4->Download();
+    gpu->psVelm4->Download();
+
+    printf("\n\nCoordinates and velocities\n");
+    for (int atom = 0; atom < gpu->natoms; ++atom)
+    {
+        const float px = gpu->psPosq4->_pSysStream[0][atom].x;
+        const float py = gpu->psPosq4->_pSysStream[0][atom].y;
+        const float pz = gpu->psPosq4->_pSysStream[0][atom].z;
+        const float pw = gpu->psPosq4->_pSysStream[0][atom].w;
+        const float vx = gpu->psVelm4->_pSysStream[0][atom].x;
+        const float vy = gpu->psVelm4->_pSysStream[0][atom].y;
+        const float vz = gpu->psVelm4->_pSysStream[0][atom].z;
+        const float vw = gpu->psVelm4->_pSysStream[0][atom].w;
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", atom,
+            px, py, pz, pw, vx, vy, vz, vw);
+    }
+}
+
+// True when f is NaN: NaN is the only float value that does not compare equal
+// to itself.
+bool ISNAN(float f)
+{
+    return f != f;
+}
+
+// Downloads positions, velocities, forces and Born forces and looks for NaNs in
+// the x/y/z components (and in the Born force) of every atom.
+//
+// Every offending atom is printed. If any were found, the routine additionally
+// prints each atom's closest neighbour, dumps the atom data, runs the CPU
+// local-force check and then re-evaluates the nonbonded, OBC and local force
+// kernels one group at a time, dumping the forces after each.
+//
+// Returns true when no NaN was found, false otherwise.
+extern "C"
+bool gpuCheckData(gpuContext gpu)
+{
+    // Bring the current device state back to the host
+    gpu->psPosq4->Download();
+    gpu->psVelm4->Download();
+    gpu->psForce4->Download();
+    gpu->psBornForce->Download();
+
+    // Pass 1: report every atom carrying a NaN
+    int violations = 0;
+    for (int atom = 0; atom < gpu->natoms; atom++)
+    {
+        const float px   = gpu->psPosq4->_pSysStream[0][atom].x;
+        const float py   = gpu->psPosq4->_pSysStream[0][atom].y;
+        const float pz   = gpu->psPosq4->_pSysStream[0][atom].z;
+        const float vx   = gpu->psVelm4->_pSysStream[0][atom].x;
+        const float vy   = gpu->psVelm4->_pSysStream[0][atom].y;
+        const float vz   = gpu->psVelm4->_pSysStream[0][atom].z;
+        const float fx   = gpu->psForce4->_pSysStream[0][atom].x;
+        const float fy   = gpu->psForce4->_pSysStream[0][atom].y;
+        const float fz   = gpu->psForce4->_pSysStream[0][atom].z;
+        const float born = gpu->psBornForce->_pSysStream[0][atom];
+
+        const bool clean = !(ISNAN(px) || ISNAN(py) || ISNAN(pz) ||
+                             ISNAN(vx) || ISNAN(vy) || ISNAN(vz) ||
+                             ISNAN(fx) || ISNAN(fy) || ISNAN(fz) ||
+                             ISNAN(born));
+        if (clean)
+            continue;
+
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", atom,
+            px, py, pz, vx, vy, vz, fx, fy, fz, born);
+        violations++;
+    }
+
+    if (violations <= 0)
+        return true;
+
+    printf("%d total violations\n", violations);
+
+    // Pass 2: brute-force nearest neighbour of every atom
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        float nearestDist = 99999999.0f;
+        int nearestAtom   = -9999;
+        const float x = gpu->psPosq4->_pSysStream[0][i].x;
+        const float y = gpu->psPosq4->_pSysStream[0][i].y;
+        const float z = gpu->psPosq4->_pSysStream[0][i].z;
+        for (int j = 0; j < gpu->natoms; j++)
+        {
+            if (j == i)
+                continue;
+
+            float dx = gpu->psPosq4->_pSysStream[0][j].x - x;
+            float dy = gpu->psPosq4->_pSysStream[0][j].y - y;
+            float dz = gpu->psPosq4->_pSysStream[0][j].z - z;
+            float r = sqrt(dx * dx + dy * dy + dz * dz);
+            if (r < nearestDist)
+            {
+                nearestDist = r;
+                nearestAtom = j;
+            }
+        }
+        printf("Atom %4d: Closest neighbor is Atom %4d, %11.5e\n", i, nearestAtom, nearestDist);
+    }
+
+    gpuDumpAtomData(gpu);
+
+    // CPU reference pass over the local forces
+    kClearBornForces(gpu);
+    kClearForces(gpu);
+    kCPUCalculateLocalForces(gpu);
+
+    // Determine which forces have gone awry: evaluate each group on its own,
+    // starting from cleared buffers every time.
+    kClearBornForces(gpu);
+    kClearForces(gpu);
+    kCalculateCDLJForces(gpu);
+    kReduceForces(gpu);
+    printf("Nonbond Forces\n");
+    gpuDumpForces(gpu);
+
+    kClearBornForces(gpu);
+    kClearForces(gpu);
+    kCalculateObcGbsaForces1(gpu);
+    kReduceObcGbsaBornForces(gpu);
+    kCalculateObcGbsaForces2(gpu);
+    kReduceForces(gpu);
+    printf("OBC Forces\n");
+    gpuDumpForces(gpu);
+
+    kClearBornForces(gpu);
+    kClearForces(gpu);
+    kCalculateLocalForces(gpu);
+    kReduceForces(gpu);
+    printf("Local Forces\n");
+    gpuDumpForces(gpu);
+
+    kClearBornForces(gpu);
+    kClearForces(gpu);
+    kReduceForces(gpu);
+    printf("Cleared Forces\n");
+    gpuDumpForces(gpu);
+
+    return false;
+}
+
+// Host-side evaluation of the LJ14 pair list, for debugging.
+//
+// Positions and forces are downloaded first; the host copies of psLJ14ID and
+// psLJ14Parameter are used as they are (no Download() is issued for them).
+// For every pair the force term is accumulated into the host copy of psForce4
+// at index atom + slot * gpu->sim.stride, where the slots come from the .z/.w
+// fields of the ID entry, and one diagnostic line is printed. The modified
+// forces are not uploaded here.
+extern "C"
+void kCPUCalculate14(gpuContext gpu)
+{
+    gpu->psPosq4->Download();
+    gpu->psForce4->Download();
+
+    const int pairCount = (int) gpu->sim.LJ14s;
+    for (int pos = 0; pos < pairCount; pos++)
+    {
+        const int4   ids    = gpu->psLJ14ID->_pSysStream[0][pos];
+        const float4 params = gpu->psLJ14Parameter->_pSysStream[0][pos];
+        const float4 posA   = gpu->psPosq4->_pSysStream[0][ids.x];
+        const float4 posB   = gpu->psPosq4->_pSysStream[0][ids.y];
+
+        // Separation vector A - B and its squared length
+        float dx = posA.x - posB.x;
+        float dy = posA.y - posB.y;
+        float dz = posA.z - posB.z;
+        const float r2       = dx * dx + dy * dy + dz * dz;
+        const float inverseR = 1.0f / sqrt(r2);
+
+        // (params.y / r)^2 and its cube
+        float sig2 = inverseR * params.y;
+        sig2      *= sig2;
+        const float sig6 = sig2 * sig2 * sig2;
+
+        // params.x scales the 12-6 term, params.z the 1/r term
+        float dEdR = params.x * (12.0f * sig6 - 6.0f) * sig6;
+        dEdR      += params.z * inverseR;
+        dEdR      *= inverseR * inverseR;
+
+        // Both entries are read before either is written back
+        const unsigned int slotA = ids.x + ids.z * gpu->sim.stride;
+        const unsigned int slotB = ids.y + ids.w * gpu->sim.stride;
+        float4 forceA = gpu->psForce4->_pSysStream[0][slotA];
+        float4 forceB = gpu->psForce4->_pSysStream[0][slotB];
+
+        dx *= dEdR;
+        dy *= dEdR;
+        dz *= dEdR;
+        forceA.x += dx;
+        forceA.y += dy;
+        forceA.z += dz;
+        forceB.x -= dx;
+        forceB.y -= dy;
+        forceB.z -= dz;
+        gpu->psForce4->_pSysStream[0][slotA] = forceA;
+        gpu->psForce4->_pSysStream[0][slotB] = forceB;
+
+        printf("%4d: %4d - %4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", pos, ids.x, ids.y, r2, dEdR, sig2, sig6, params.x, params.z);
+    }
+}
+
+
+// Downloads psPosqP4 and prints the index and the x, y, z, w components of each
+// of the gpu->natoms entries, one atom per line.
+extern "C"
+void gpuDumpPrimeCoordinates(gpuContext gpu)
+{
+    gpu->psPosqP4->Download();
+
+    const int atomCount = gpu->natoms;
+    for (int atom = 0; atom < atomCount; ++atom)
+    {
+        const float x = gpu->psPosqP4->_pSysStream[0][atom].x;
+        const float y = gpu->psPosqP4->_pSysStream[0][atom].y;
+        const float z = gpu->psPosqP4->_pSysStream[0][atom].z;
+        const float w = gpu->psPosqP4->_pSysStream[0][atom].w;
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n", atom, x, y, z, w);
+    }
+}
+
+// Downloads psForce4 and psBornForce and prints one line per atom: the atom
+// index, the x/y/z force components and the Born force.
+//
+// The line is written with printf, like gpuDumpCoordinates(). It used to be
+// formatted into a local buffer whose only consumer (OutputDebugString) was
+// commented out, so the dump produced no output at all.
+extern "C"
+void gpuDumpForces(gpuContext gpu)
+{
+    gpu->psForce4->Download();
+    gpu->psBornForce->Download();
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n", i,
+            gpu->psForce4->_pSysStream[0][i].x,
+            gpu->psForce4->_pSysStream[0][i].y,
+            gpu->psForce4->_pSysStream[0][i].z,
+            gpu->psBornForce->_pSysStream[0][i]
+        );
+    }
+}
+
+// Downloads psPosq4, psSigEps2, psBornRadii and psObcChain and prints one line
+// per atom: index, psPosq4 x/y/z/w, psSigEps2 x/y, Born radius and ObcChain
+// value.
+//
+// Two fixes over the previous version: the format string now has eight float
+// conversions to match the eight values passed (the ObcChain value used to be
+// dropped), and the line is written with printf instead of being formatted into
+// a buffer that was never emitted.
+extern "C"
+void gpuDumpAtomData(gpuContext gpu)
+{
+    gpu->psPosq4->Download();
+    gpu->psSigEps2->Download();
+    gpu->psBornRadii->Download();
+    gpu->psObcChain->Download();
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
+            gpu->psPosq4->_pSysStream[0][i].x,
+            gpu->psPosq4->_pSysStream[0][i].y,
+            gpu->psPosq4->_pSysStream[0][i].z,
+            gpu->psPosq4->_pSysStream[0][i].w,
+            gpu->psSigEps2->_pSysStream[0][i].x,
+            gpu->psSigEps2->_pSysStream[0][i].y,
+            gpu->psBornRadii->_pSysStream[0][i],
+            gpu->psObcChain->_pSysStream[0][i]
+        );
+    }
+}
+
+
+// One-shot initialisation driver. pVoid is cast to gpuContext.
+//
+// Reads the eight parameter files under Data/ (printing the value each reader
+// returns), builds the work-unit and exclusion lists and the output buffers,
+// sets the kernel constants, initialises the random numbers, and finally runs
+// the Born-sum kernels before clearing both force buffers.
+extern "C"
+void gpuSetup(void* pVoid)
+{
+    gpuContext gpu = static_cast<gpuContext>(pVoid);
+
+    // Parameter files, each reported with the count returned by its reader
+    cout << gpuReadAtomicParameters(gpu, "Data/atomicradii.txt")
+         << " atom types\n";
+    cout << gpuReadBondParameters(gpu, "Data/GromacsHarmonicBondParameter.txt")
+         << " bond parameters.\n";
+    cout << gpuReadBondAngleParameters(gpu, "Data/GromacsAngleBondParameter.txt")
+         << " bond angle parameters.\n";
+    cout << gpuReadDihedralParameters(gpu, "Data/GromacsProperDihedralParameter.txt")
+         << " proper dihedral parameters.\n";
+    cout << gpuReadRbDihedralParameters(gpu, "Data/GromacsRbDihedralParameter.txt")
+         << " Ryckaert-Bellemans dihedral parameters.\n";
+    cout << gpuReadLJ14Parameters(gpu, "Data/GromacsLJ14Parameter.txt")
+         << " Lennard-Jones 1-4 parameters.\n";
+    cout << gpuReadCoulombParameters(gpu, "Data/GromacsLJCoulombParameter.txt")
+         << " Coulomb parameters.\n";
+    cout << gpuReadShakeParameters(gpu, "Data/GromacsShakeParameters.txt")
+         << " shake parameters.\n";
+
+    // Derived host/device tables, in this order
+    gpuBuildThreadBlockWorkList(gpu);
+    gpuBuildExclusionList(gpu);
+    gpuBuildOutputBuffers(gpu);
+
+    // Constants, then random numbers
+    gpuSetConstants(gpu);
+    gpuInitializeRandoms(gpu);
+
+    // Initial Born radii, then start from cleared force buffers
+    kCalculateObcGbsaBornSum(gpu);
+    kReduceObcGbsaBornSum(gpu);
+    kClearForces(gpu);
+    kClearBornForces(gpu);
+}
+
+
+// Helper macros for the CPU reference force code. Vector arguments may be any
+// expression with .x/.y/.z members; arguments are parenthesised so that
+// compound expressions expand safely, and every multi-statement macro is a
+// single braced block so it behaves as one statement.
+
+// Dot product of two 3-vectors.
+#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)
+
+// dp = DOT3(v1, v2) / sqrt(|v1|^2 * |v2|^2), clamped to [-1, 1] so that a later
+// acos() is never handed a value pushed out of range by rounding.
+#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
+{ \
+    dp          = DOT3(v1, v2); \
+    float norm1 = DOT3(v1, v1); \
+    float norm2 = DOT3(v2, v2); \
+    dp /= sqrt(norm1 * norm2); \
+    dp = min(dp, 1.0f); \
+    dp = max(dp, -1.0f); \
+}
+
+// c = v1 x v2. c must not alias v1 or v2: its components are written one at a
+// time while the inputs are still being read.
+#define CROSS_PRODUCT(v1, v2, c) \
+{ \
+    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
+    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
+    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
+}
+
+// dEdR = param.y * (acos(cosine) - param.x * pi / 180), i.e. param.x is
+// converted from degrees to radians before the difference is taken.
+#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
+{ \
+   float angle          = acos(cosine); \
+   float deltaIdeal     = angle - ((param).x * (3.14159265f / 180.0f)); \
+   dEdR                 = (param).y * deltaIdeal; \
+}
+
+// angle = acos of the clamped, normalised dot product of v1 and v2.
+#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
+{ \
+    float dp; \
+    GETNORMEDDOTPRODUCT(v1, v2, dp); \
+    angle = acos(dp); \
+}
+
+// Same as GETANGLEBETWEENTWOVECTORS, but also returns the cosine.
+#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
+{ \
+    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
+    angle = acos(cosine); \
+}
+
+// cp0 = vector1 x vector2, cp1 = vector2 x vector3, angle = angle between cp0
+// and cp1, negated when DOT3(signVector, cp1) is negative.
+#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
+{ \
+    CROSS_PRODUCT(vector1, vector2, cp0); \
+    CROSS_PRODUCT(vector2, vector3, cp1); \
+    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
+    float dp = DOT3(signVector, cp1); \
+    angle = (dp >= 0) ? angle : -angle; \
+}
+
+// Same as GETDIHEDRALANGLEBETWEENTHREEVECTORS, but also returns the cosine of
+// the unsigned angle between cp0 and cp1.
+#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
+{ \
+    CROSS_PRODUCT(vector1, vector2, cp0); \
+    CROSS_PRODUCT(vector2, vector3, cp1); \
+    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
+    float dp = DOT3(signVector, cp1); \
+    angle = (dp >= 0) ? angle : -angle; \
+}
+
+// Calculate Local forces on CPU
+extern "C"
+void kCPUCalculateLocalForces(gpuContext gpu)
+{
+    gpu->psPosq4->Download();
+    gpu->psForce4->Download();
+    gpu->psBondID->Download();
+    gpu->psBondParameter->Download();
+    gpu->psBondAngleID1->Download();
+    gpu->psBondAngleID2->Download();
+    gpu->psBondAngleParameter->Download();
+    gpu->psDihedralID1->Download();
+    gpu->psDihedralID2->Download();
+    gpu->psDihedralParameter->Download();
+    gpu->psRbDihedralID1->Download();
+    gpu->psRbDihedralID2->Download();
+    gpu->psRbDihedralParameter1->Download();
+    gpu->psRbDihedralParameter2->Download();
+    gpu->psLJ14ID->Download();
+    gpu->psLJ14Parameter->Download();
+
+    unsigned int pos = 0;
+    Vectors V;
+    Vectors* A = &V;
+    int violations = 0;
+
+    while (pos < gpu->sim.bond_offset)
+    {
+        if (pos < gpu->sim.bonds)
+        {
+            int4   atom         = gpu->psBondID->_pSysStream[0][pos];
+            float4 atomA        = gpu->psPosq4->_pSysStream[0][atom.x];
+            float4 atomB        = gpu->psPosq4->_pSysStream[0][atom.y];
+            float2 bond         = gpu->psBondParameter->_pSysStream[0][pos];
+            float dx            = atomB.x - atomA.x;
+            float dy            = atomB.y - atomA.y;
+            float dz            = atomB.z - atomA.z;
+            float r2            = dx * dx + dy * dy + dz * dz;
+            float r             = sqrt(r2);
+            float deltaIdeal    = r - bond.x;
+            float dEdR          = bond.y * deltaIdeal;
+            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
+            if (fabs(deltaIdeal) > 1.0f)
+            {
+                printf("Bond %4d: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", pos, dx, dy, dz, r, deltaIdeal, dEdR);
+                violations++;
+            }
+            dx                 *= dEdR;
+            dy                 *= dEdR;
+            dz                 *= dEdR;
+            unsigned int offsetA                = atom.x + atom.z * gpu->sim.stride;
+            unsigned int offsetB                = atom.y + atom.w * gpu->sim.stride;
+            float4 forceA                       = gpu->psForce4->_pSysStream[0][offsetA];
+            float4 forceB                       = gpu->psForce4->_pSysStream[0][offsetB];
+            forceA.x                           += dx;
+            forceA.y                           += dy;
+            forceA.z                           += dz;
+            forceB.x                           -= dx;
+            forceB.y                           -= dy;
+            forceB.z                           -= dz;
+            gpu->psForce4->_pSysStream[0][offsetA]               = forceA;
+            gpu->psForce4->_pSysStream[0][offsetB]               = forceB;    
+        }
+        pos++;
+    }
+#if 0  
+    while (pos < gpu->sim.bond_angle_offset)
+    {
+        unsigned int pos1   = pos - gpu->sim.bond_offset;
+        if (pos1 < gpu->sim.bond_angles)
+        {
+            int4   atom1        = gpu->psBondAngleID1->_pSysStream[0][pos1];  
+            float2 bond_angle   = gpu->psBondAngleParameter->_pSysStream[0][pos1];
+            float4 a1           = gpu->psPosq4->_pSysStream[0][atom1.x];
+            float4 a2           = gpu->psPosq4->_pSysStream[0][atom1.y];
+            float4 a3           = gpu->psPosq4->_pSysStream[0][atom1.z];
+            A->v0.x = a2.x - a1.x;
+            A->v0.y = a2.y - a1.y;
+            A->v0.z = a2.z - a1.z;
+            A->v1.x = a2.x - a3.x;
+            A->v1.y = a2.y - a3.y;
+            A->v1.z = a2.z - a3.z;
+            float3 cp;
+            CROSS_PRODUCT(A->v0, A->v1, cp);
+            float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
+            rp = max(sqrt(rp), 1.0e-06f);
+            float r21       = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
+            float r23       = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
+            float dot       = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
+            float cosine    = dot / sqrt(r21 * r23);
+            float dEdR;
+            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
+            printf("Bond angle %4d %11.4f %11.4f\n", pos1, cosine, dEdR);
+            float termA =  dEdR / (r21 * rp);
+            float termC = -dEdR / (r23 * rp);
+            float3 c21;
+            float3 c23;
+            CROSS_PRODUCT(A->v0, cp, c21);
+            CROSS_PRODUCT(A->v1, cp, c23);
+            c21.x *= termA;
+            c21.y *= termA;
+            c21.z *= termA;
+            c23.x *= termC;
+            c23.y *= termC;
+            c23.z *= termC;
+            int2 atom2 = gpu->psBondAngleID2->_pSysStream[0][pos1];
+            unsigned int offset = atom1.x + atom1.w * gpu->sim.stride;
+            float4 force = gpu->psForce4->_pSysStream[0][offset]; 
+            force.x += c21.x;
+            force.y += c21.y;
+            force.z += c21.z;
+            gpu->psForce4->_pSysStream[0][offset] = force;
+            offset = atom1.y + atom2.x * gpu->sim.stride;
+            force = gpu->psForce4->_pSysStream[0][offset];
+            force.x -= (c21.x + c23.x);
+            force.y -= (c21.y + c23.y);
+            force.z -= (c21.z + c23.z);
+            gpu->psForce4->_pSysStream[0][offset] = force;
+            offset = atom1.z + atom2.y * gpu->sim.stride;
+            force = gpu->psForce4->_pSysStream[0][offset];
+            force.x += c23.x;
+            force.y += c23.y;
+            force.z += c23.z;
+            gpu->psForce4->_pSysStream[0][offset] = force;
+        }
+        pos++;
+    }
+            
+    while (pos < gpu->sim.dihedral_offset)
+    {
+        unsigned int pos1 = pos - gpu->sim.bond_angle_offset;
+        if (pos1 < gpu->sim.dihedrals)
+        {
+            int4   atom1        = gpu->psDihedralID1->_pSysStream[0][pos1];  
+            float4 atomA        = gpu->psPosq4->_pSysStream[0][atom1.x];
+            float4 atomB        = gpu->psPosq4->_pSysStream[0][atom1.y];
+            float4 atomC        = gpu->psPosq4->_pSysStream[0][atom1.z];
+            float4 atomD        = gpu->psPosq4->_pSysStream[0][atom1.w];            
+            A->v0.x             = atomA.x - atomB.x;
+            A->v0.y             = atomA.y - atomB.y;
+            A->v0.z             = atomA.z - atomB.z;
+            A->v1.x             = atomC.x - atomB.x;
+            A->v1.y             = atomC.y - atomB.y;
+            A->v1.z             = atomC.z - atomB.z;
+            A->v2.x             = atomC.x - atomD.x;
+            A->v2.y             = atomC.y - atomD.y;
+            A->v2.z             = atomC.z - atomD.z; 
+            float3 cp0, cp1;
+            float dihedralAngle;
+            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
+            float4 dihedral         = gpu->psDihedralParameter->_pSysStream[0][pos1];
+            float deltaAngle        = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
+            float sinDeltaAngle     = sin(deltaAngle);
+            float dEdAngle          = -dihedral.x * dihedral.z * sinDeltaAngle;
+            float normCross1        = DOT3(cp0, cp0);
+            float normBC            = sqrt(DOT3(A->v1, A->v1));
+            float4 ff;
+            ff.x                    = (-dEdAngle * normBC) / normCross1;
+            float normCross2        = DOT3(cp1, cp1);
+            ff.w                    = (dEdAngle * normBC) / normCross2;
+            float dp                = 1.0f / DOT3(A->v1, A->v1);
+            ff.y                    = DOT3(A->v0, A->v1) * dp;
+            ff.z                    = DOT3(A->v2, A->v1) * dp;
+            int4  atom2             = gpu->psDihedralID2->_pSysStream[0][pos1];   
+            float3 internalF0;
+            float3 internalF3;
+            float3 s;
+            
+//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
+            unsigned int offset                 = atom1.x + atom2.x * gpu->sim.stride;
+            float4 force                        = gpu->psForce4->_pSysStream[0][offset];
+            internalF0.x                        = ff.x * cp0.x; 
+            force.x                            += internalF0.x;
+            internalF0.y                        = ff.x * cp0.y;
+            force.y                            += internalF0.y;
+            internalF0.z                        = ff.x * cp0.z;       
+            force.z                            += internalF0.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            
+            printf("Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+            offset                              = atom1.w + atom2.w * gpu->sim.stride;
+            force                               = gpu->psForce4->_pSysStream[0][offset];
+            internalF3.x                        = ff.w * cp1.x;
+            force.x                            += internalF3.x;
+            internalF3.y                        = ff.w * cp1.y;
+            force.y                            += internalF3.y;
+            internalF3.z                        = ff.w * cp1.z;
+            force.z                            += internalF3.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            
+            printf("Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
+            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
+            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
+            offset                              = atom1.y + atom2.y * gpu->sim.stride;
+            force                               = gpu->psForce4->_pSysStream[0][offset];
+            force.x                            += -internalF0.x + s.x;
+            force.y                            += -internalF0.y + s.y;
+            force.z                            += -internalF0.z + s.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            
+            printf("Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+            offset                              = atom1.z + atom2.z * gpu->sim.stride;
+            force                               = gpu->psForce4->_pSysStream[0][offset];
+            force.x                            += -internalF3.x - s.x;
+            force.y                            += -internalF3.y - s.y;
+            force.z                            += -internalF3.z - s.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            printf("Dihedral %4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+        }        
+        pos++;
+    }
+
+    while (pos < gpu->sim.rb_dihedral_offset)
+    {
+        unsigned int pos1 = pos - gpu->sim.dihedral_offset;
+        if (pos1 < gpu->sim.rb_dihedrals)
+        {
+            int4   atom1        = gpu->psRbDihedralID1->_pSysStream[0][pos1];  
+            float4 atomA        = gpu->psPosq4->_pSysStream[0][atom1.x];
+            float4 atomB        = gpu->psPosq4->_pSysStream[0][atom1.y];
+            float4 atomC        = gpu->psPosq4->_pSysStream[0][atom1.z];
+            float4 atomD        = gpu->psPosq4->_pSysStream[0][atom1.w];            
+            A->v0.x             = atomA.x - atomB.x;
+            A->v0.y             = atomA.y - atomB.y;
+            A->v0.z             = atomA.z - atomB.z;
+            A->v1.x             = atomC.x - atomB.x;
+            A->v1.y             = atomC.y - atomB.y;
+            A->v1.z             = atomC.z - atomB.z;
+            A->v2.x             = atomC.x - atomD.x;
+            A->v2.y             = atomC.y - atomD.y;
+            A->v2.z             = atomC.z - atomD.z; 
+            float3 cp0, cp1;
+            float dihedralAngle, cosPhi;
+      //      printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z); 
+      //      printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z); 
+      //      printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);  
+            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
+            if (dihedralAngle < 0.0f )
+            {
+                dihedralAngle += 3.14159265f;
+            } 
+            else 
+            {
+                dihedralAngle -= 3.14159265f;
+            }
+            cosPhi                  = -cosPhi;
+         //   printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
+            float4 dihedral1        = gpu->psRbDihedralParameter1->_pSysStream[0][pos1];
+            float2 dihedral2        = gpu->psRbDihedralParameter2->_pSysStream[0][pos1];
+            float cosFactor         = cosPhi;
+            float dEdAngle          = -dihedral1.y;
+        //    printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
+            dEdAngle               -= 2.0f * dihedral1.z * cosFactor;
+       //     printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 3.0f * dihedral1.w * cosFactor;
+     //       printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 4.0f * dihedral2.x * cosFactor;
+   //         printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 5.0f * dihedral2.y * cosFactor;
+ //           printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            dEdAngle               *= sin(dihedralAngle);  
+//            printf("%4d - f: %9.4f\n", pos1, dEdAngle);
+            
+            float normCross1        = DOT3(cp0, cp0);
+            float normBC            = sqrt(DOT3(A->v1, A->v1));
+            float4 ff;
+            ff.x                    = (-dEdAngle * normBC) / normCross1;
+            float normCross2        = DOT3(cp1, cp1);
+            ff.w                    = (dEdAngle * normBC) / normCross2;
+            float dp                = 1.0f / DOT3(A->v1, A->v1);
+            ff.y                    = DOT3(A->v0, A->v1) * dp;
+            ff.z                    = DOT3(A->v2, A->v1) * dp;
+            int4  atom2             = gpu->psRbDihedralID2->_pSysStream[0][pos1];   
+            float3 internalF0;
+            float3 internalF3;
+            float3 s;
+            
+            printf("RB Dihedral %4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
+            unsigned int offset                 = atom1.x + atom2.x * gpu->sim.stride;
+            float4 force                        = gpu->psForce4->_pSysStream[0][offset];
+            internalF0.x                        = ff.x * cp0.x; 
+            force.x                            += internalF0.x;
+            internalF0.y                        = ff.x * cp0.y;
+            force.y                            += internalF0.y;
+            internalF0.z                        = ff.x * cp0.z;       
+            force.z                            += internalF0.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            
+            printf("RB Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+            offset                              = atom1.w + atom2.w * gpu->sim.stride;
+            force                               = gpu->psForce4->_pSysStream[0][offset];
+            internalF3.x                        = ff.w * cp1.x;
+            force.x                            += internalF3.x;
+            internalF3.y                        = ff.w * cp1.y;
+            force.y                            += internalF3.y;
+            internalF3.z                        = ff.w * cp1.z;
+            force.z                            += internalF3.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            
+            printf("RB Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
+            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
+            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
+            offset                              = atom1.y + atom2.y * gpu->sim.stride;
+            force                               = gpu->psForce4->_pSysStream[0][offset];
+            force.x                            += -internalF0.x + s.x;
+            force.y                            += -internalF0.y + s.y;
+            force.z                            += -internalF0.z + s.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+            printf("RB Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+            offset                              = atom1.z + atom2.z * gpu->sim.stride;
+            force                               = gpu->psForce4->_pSysStream[0][offset];
+            force.x                            += -internalF3.x - s.x;
+            force.y                            += -internalF3.y - s.y;
+            force.z                            += -internalF3.z - s.z;
+            gpu->psForce4->_pSysStream[0][offset]                = force;
+     //       printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
+        }            
+        pos++;
+    }   
+
+    while (pos < gpu->sim.LJ14_offset)
+    {  
+        unsigned int pos1       = pos - gpu->sim.rb_dihedral_offset;
+        if (pos1 < gpu->sim.LJ14s)
+        {
+            int4 atom               = gpu->psLJ14ID->_pSysStream[0][pos1];
+            float4 LJ14             = gpu->psLJ14Parameter->_pSysStream[0][pos1];
+            float4 a1               = gpu->psPosq4->_pSysStream[0][atom.x];
+            float4 a2               = gpu->psPosq4->_pSysStream[0][atom.y];
+            float3 d;
+            d.x                     = a1.x - a2.x;
+            d.y                     = a1.y - a2.y;
+            d.z                     = a1.z - a2.z;
+            float r2                = DOT3(d, d);
+            float inverseR          = 1.0f / sqrt(r2);
+            float sig2              = inverseR * LJ14.y;
+            sig2                   *= sig2;
+            float sig6              = sig2 * sig2 * sig2;
+            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+            dEdR                   += LJ14.z * inverseR;
+            dEdR                   *= inverseR * inverseR;
+            unsigned int offsetA    = atom.x + atom.z * gpu->sim.stride;
+            unsigned int offsetB    = atom.y + atom.w * gpu->sim.stride;
+            float4 forceA           = gpu->psForce4->_pSysStream[0][offsetA];
+            float4 forceB           = gpu->psForce4->_pSysStream[0][offsetB];
+            d.x                    *= dEdR;
+            d.y                    *= dEdR;
+            d.z                    *= dEdR;
+            forceA.x               += d.x;
+            forceA.y               += d.y;
+            forceA.z               += d.z;
+            forceB.x               -= d.x;
+            forceB.y               -= d.y;
+            forceB.z               -= d.z;        
+            printf("LJ14 %d: %11.4f %11.4f %11.4f\n", pos1, d.x, d.y, d.z);
+            gpu->psForce4->_pSysStream[0][offsetA]   = forceA;
+            gpu->psForce4->_pSysStream[0][offsetB]   = forceB;
+        }        
+        pos++;
+    }
+#endif
+
+    if (violations > 0)
+    {
+        gpuDumpCoordinates(gpu);
+        gpuDumpForces(gpu);
+    }
+}
+
+/**
+ * Open "<fname>_<step>.txt" for writing and return the stream.
+ *
+ * @param fname  file-name prefix; must not be NULL
+ * @param step   integer appended to the prefix after an underscore
+ * @return       stream opened in "w" mode; never NULL
+ *
+ * On any failure a message is written to stderr and the process exits with
+ * status -1, so callers do not need to check the result.
+ */
+static FILE* getWriteToFilePtr( char* fname, int step )
+{
+   // Streaming a NULL char* into an ostream is undefined behaviour; reject it up front.
+   if( fname == NULL ){
+      (void) fprintf( stderr, "Could not open file: no file name prefix given.\n" );
+      exit(-1);
+   }
+
+   std::stringstream fileName;
+   fileName << fname << "_" << step << ".txt";
+   const std::string path = fileName.str();
+
+   FILE* filePtr = fopen( path.c_str(), "w" );
+   if( filePtr == NULL ){
+      // Newline-terminated so the message is not glued to whatever is printed next.
+      (void) fprintf( stderr, "Could not open file=<%s> for writing.\n", path.c_str() );
+      exit(-1);
+   }
+   return filePtr;
+}
+
+extern "C" {
+/**
+ * Write one text row to filePtr: the index in a 5-wide field, then each of the
+ * numberOfValues floats as " %18.10e", then a newline. The stream is flushed
+ * after every row.
+ */
+static void printValues( FILE* filePtr, int index, int numberOfValues, float* values )
+{
+   (void) fprintf( filePtr, "%5d ", index );
+
+   int column = 0;
+   while ( column < numberOfValues ) {
+      (void) fprintf( filePtr, " %18.10e", values[column] );
+      ++column;
+   }
+
+   (void) fprintf( filePtr, "\n" );
+   (void) fflush( filePtr );
+}
+}
+
+/**
+ * Write the first and last numPrint entries of a float stream to
+ * "<fname>_<step>.txt", one "index value" row per entry.
+ *
+ * @param gpu       context; gpu->natoms bounds the indices that are read
+ * @param fname     file-name prefix
+ * @param step      integer appended to the file name
+ * @param psPos     stream to dump; Download() is called on it before its
+ *                  _pSysStream[0] array is read
+ * @param numPrint  number of leading and trailing entries to write; a value
+ *                  <= 0 or >= natoms selects every entry
+ *
+ * Each entry is written at most once: when the leading and trailing ranges
+ * would overlap (including the "print everything" case) the trailing range
+ * is clipped so no row is repeated.
+ */
+extern "C"
+void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint )
+{
+   int i;
+   static const int numberOfValues = 1;
+   FILE* filePtr = getWriteToFilePtr( fname, step );
+   float values[numberOfValues];
+   psPos->Download();
+
+   numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;
+
+   // Leading entries [0, numPrint).
+   for ( i = 0; i < numPrint; i++ ) {
+      values[0] = psPos->_pSysStream[0][i];
+      printValues( filePtr, i, numberOfValues, values );
+   }
+
+   // Trailing entries, starting no earlier than where the leading range stopped.
+   int tailStart = gpu->natoms - numPrint;
+   if ( tailStart < numPrint ) {
+      tailStart = numPrint;
+   }
+   for ( i = tailStart; i < gpu->natoms; i++ ) {
+      values[0] = psPos->_pSysStream[0][i];
+      printValues( filePtr, i, numberOfValues, values );
+   }
+   (void) fclose( filePtr );
+}
+
+/**
+ * Write the first and last numPrint entries of a float2 stream to
+ * "<fname>_<step>.txt", one "index x y" row per entry.
+ *
+ * @param gpu       context; gpu->natoms bounds the indices that are read
+ * @param fname     file-name prefix
+ * @param step      integer appended to the file name
+ * @param psPos     stream to dump; Download() is called on it before its
+ *                  _pSysStream[0] array is read
+ * @param numPrint  number of leading and trailing entries to write; a value
+ *                  <= 0 or >= natoms selects every entry
+ *
+ * Each entry is written at most once: when the leading and trailing ranges
+ * would overlap (including the "print everything" case) the trailing range
+ * is clipped so no row is repeated.
+ */
+extern "C"
+void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint )
+{
+   int i;
+   static const int numberOfValues = 2;
+   FILE* filePtr = getWriteToFilePtr( fname, step );
+   float values[numberOfValues];
+   psPos->Download();
+
+   numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;
+
+   // Leading entries [0, numPrint).
+   for ( i = 0; i < numPrint; i++ ) {
+      values[0] = psPos->_pSysStream[0][i].x;
+      values[1] = psPos->_pSysStream[0][i].y;
+      printValues( filePtr, i, numberOfValues, values );
+   }
+
+   // Trailing entries, starting no earlier than where the leading range stopped.
+   int tailStart = gpu->natoms - numPrint;
+   if ( tailStart < numPrint ) {
+      tailStart = numPrint;
+   }
+   for ( i = tailStart; i < gpu->natoms; i++ ) {
+      values[0] = psPos->_pSysStream[0][i].x;
+      values[1] = psPos->_pSysStream[0][i].y;
+      printValues( filePtr, i, numberOfValues, values );
+   }
+   (void) fclose( filePtr );
+}
+
+/**
+ * Write the first and last numPrint entries of a float4 stream to
+ * "<fname>_<step>.txt", one "index x y z w" row per entry.
+ *
+ * @param gpu       context; gpu->natoms bounds the indices that are read
+ * @param fname     file-name prefix
+ * @param step      integer appended to the file name
+ * @param psPos     stream to dump; Download() is called on it before its
+ *                  _pSysStream[0] array is read
+ * @param numPrint  number of leading and trailing entries to write; a value
+ *                  <= 0 or >= natoms selects every entry
+ *
+ * Each entry is written at most once: when the leading and trailing ranges
+ * would overlap (including the "print everything" case) the trailing range
+ * is clipped so no row is repeated.
+ */
+extern "C"
+void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint )
+{
+   int i;
+   static const int numberOfValues = 4;
+   FILE* filePtr = getWriteToFilePtr( fname, step );
+   float values[numberOfValues];
+   psPos->Download();
+
+   numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;
+
+   // Leading entries [0, numPrint).
+   for ( i = 0; i < numPrint; i++ ) {
+      values[0] = psPos->_pSysStream[0][i].x;
+      values[1] = psPos->_pSysStream[0][i].y;
+      values[2] = psPos->_pSysStream[0][i].z;
+      values[3] = psPos->_pSysStream[0][i].w;
+      printValues( filePtr, i, numberOfValues, values );
+   }
+
+   // Trailing entries, starting no earlier than where the leading range stopped.
+   int tailStart = gpu->natoms - numPrint;
+   if ( tailStart < numPrint ) {
+      tailStart = numPrint;
+   }
+   for ( i = tailStart; i < gpu->natoms; i++ ) {
+      values[0] = psPos->_pSysStream[0][i].x;
+      values[1] = psPos->_pSysStream[0][i].y;
+      values[2] = psPos->_pSysStream[0][i].z;
+      values[3] = psPos->_pSysStream[0][i].w;
+      printValues( filePtr, i, numberOfValues, values );
+   }
+   (void) fclose( filePtr );
+}
+
+/**
+ * Print one line per atom to stdout with, in order: psPosq4 x/y/z/w,
+ * psBornRadii, psBornSum, psObcData.x and psObcData.y.
+ *
+ * Download() is called on each of the four streams before their
+ * _pSysStream[0] arrays are read. The header line names every printed
+ * column, including the Born sum that sits between the Born radius and the
+ * two psObcData components.
+ */
+extern "C"
+void gpuDumpObcInfo(gpuContext gpu)
+{
+    gpu->psPosq4->Download();
+    gpu->psBornRadii->Download();
+    gpu->psObcData->Download();
+    gpu->psBornSum->Download();
+    printf( "\n\nObc Info xyzw Brad BrnSum atomR scaledAtomR\n" );
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        const float4 posq   = gpu->psPosq4->_pSysStream[0][i];
+        const float2 obc    = gpu->psObcData->_pSysStream[0][i];
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
+            posq.x,
+            posq.y,
+            posq.z,
+            posq.w,
+            gpu->psBornRadii->_pSysStream[0][i],
+            gpu->psBornSum->_pSysStream[0][i],
+            obc.x,
+            obc.y
+        );
+    }
+}
+
+/**
+ * Print one line per atom to stdout with, in order: psForce4 x/y/z,
+ * psBornRadii, compF, psBornForce and psObcChain, where
+ *     compF = bornForce / (bornRadius * bornRadius * obcChain).
+ *
+ * Download() is called on every stream involved before its _pSysStream[0]
+ * array is read. psBornSum is downloaded as well even though its column is
+ * currently commented out; the call is kept so the host copy is refreshed
+ * exactly as before. compF is not guarded against a zero denominator, so
+ * inf/nan can appear in the output.
+ */
+extern "C"
+void gpuDumpObcLoop1(gpuContext gpu)
+{
+    float compF;
+    gpu->psForce4->Download();
+    gpu->psBornRadii->Download();
+    gpu->psBornForce->Download();
+    gpu->psObcChain->Download();
+    gpu->psBornSum->Download();
+    // Header names every printed column, including the derived compF value.
+    printf( "\n\nObc F3 BrnR compF BrnF Chn\n" );
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        compF = gpu->psBornForce->_pSysStream[0][i]/(gpu->psBornRadii->_pSysStream[0][i]*gpu->psBornRadii->_pSysStream[0][i]*gpu->psObcChain->_pSysStream[0][i]);
+        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
+            gpu->psForce4->_pSysStream[0][i].x,
+            gpu->psForce4->_pSysStream[0][i].y,
+            gpu->psForce4->_pSysStream[0][i].z,
+//            gpu->psForce4->_pSysStream[0][i].w,
+            gpu->psBornRadii->_pSysStream[0][i],
+            compF,
+            gpu->psBornForce->_pSysStream[0][i],
+//            gpu->psBornSum->_pSysStream[0][i],
+            gpu->psObcChain->_pSysStream[0][i]
+        );
+    }
+}
--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
+#ifndef __GPUTYPES_H__
+#define __GPUTYPES_H__
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "cudatypes.h"
+#include <string>
+#include <vector>
+
+// One entry of the atom-type table referenced by _gpuContext::gpAtomTable.
+// std::string is spelled out so this header does not rely on the including
+// file having issued `using namespace std`.
+struct gpuAtomType {
+    std::string name;   // type name
+    char symbol;        // one-character symbol
+    float r;            // presumably the atomic radius (cf. gpuGetAtomicRadius) - confirm
+};
+
+// Shader-model tag stored in _gpuContext::sm_version. Enumerators take the
+// default values 0, 1, 2 in the order listed (presumably compute capability
+// 1.0 / 1.1 / 1.2 - confirm).
+enum SM_VERSION { SM_10, SM_11, SM_12 };
+
+
+/* Host-side GPU context: simulation constants plus the CUDAStream buffers
+ * that the host helpers read and write through _pSysStream[0].
+ * Pointer to this structure will be given
+ * to gromacs functions*/
+struct _gpuContext {
+    
+    
+    //Cache this here so that it doesn't
+    //have to be repeatedly passed around
+    int natoms;                             // atom count; the dump/write helpers loop i over [0, natoms)
+    gpuAtomType* gpAtomTable;
+    int gAtomTypes;
+    cudaGmxSimulation sim;                  // host copy of the constants each kernel file copies into its __constant__ cSim
+    unsigned int* pOutputBufferCounter;
+    unsigned int* pExclusion;
+    unsigned char* pAtomSymbol;
+    float iterations;
+    float epsfac;
+    float solventDielectric;
+    float soluteDielectric;
+    int grid;
+    bool bCalculateCM;
+    bool bRemoveCM;
+	 bool bRecalculateBornRadii;
+    unsigned long seed;
+    SM_VERSION sm_version;
+    CUDAStream<float4>* psPosq4;            // per-atom float4; x,y,z are read as coordinates by the host LJ14 code (w presumably charge - confirm)
+    CUDAStream<float4>* psPosqP4;
+    CUDAStream<float4>* psOldPosq4;
+    CUDAStream<float4>* psVelm4;            // presumably velocity in x,y,z with an inverse mass in w - confirm
+    CUDAStream<float4>* psForce4;           // force accumulators; host code indexes them as atom + bufferIndex * sim.stride
+    CUDAStream<float4>* psxVector4;
+    CUDAStream<float4>* psvVector4;
+    CUDAStream<float2>* psSigEps2; 
+    CUDAStream<float2>* psObcData;          // x,y are printed under the labels atomR / scaledAtomR by gpuDumpObcInfo
+    CUDAStream<float>* psObcChain;          // printed by gpuDumpObcLoop1
+    CUDAStream<float>* psBornForce;         // printed by gpuDumpObcLoop1
+    CUDAStream<float>* psBornRadii;         // printed by gpuDumpObcInfo and gpuDumpObcLoop1
+    CUDAStream<float>* psBornSum;           // printed by gpuDumpObcInfo
+    CUDAStream<int4>* psBondID;
+    CUDAStream<float2>* psBondParameter;
+    CUDAStream<int4>* psBondAngleID1;
+    CUDAStream<int2>* psBondAngleID2;
+    CUDAStream<float2>* psBondAngleParameter;
+    CUDAStream<int4>* psDihedralID1;
+    CUDAStream<int4>* psDihedralID2;
+    CUDAStream<float4>* psDihedralParameter;
+    CUDAStream<int4>* psRbDihedralID1;
+    CUDAStream<int4>* psRbDihedralID2;      // x..w are multiplied by sim.stride to select the force buffer for each of the four atoms
+    CUDAStream<float4>* psRbDihedralParameter1; // y,z,w are used as coefficients of the cosPhi polynomial in the host RB dihedral code
+    CUDAStream<float2>* psRbDihedralParameter2; // x,y are the two highest-order coefficients of that polynomial
+    CUDAStream<int4>* psLJ14ID;             // x,y index psPosq4; z,w are multiplied by sim.stride to select the force buffers
+    CUDAStream<float4>* psLJ14Parameter;    // x,y,z feed the 1-4 force expression in the host LJ14 code
+    CUDAStream<int>* psNonShakeID;
+    CUDAStream<int4>* psShakeID;
+    CUDAStream<float4>* psShakeParameter;
+    CUDAStream<unsigned int>* psExclusion;
+    CUDAStream<unsigned int>* psWorkUnit;
+    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
+    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
+    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
+    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
+    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
+   
+
+};
+
+typedef struct _gpuContext *gpuContext;     // handle type taken by every gpu* / k* entry point declared in this header
+
+
+// Function prototypes. NOTE(review): everything below has C linkage (extern "C") although many signatures take C++ types such as std::vector references, so they are callable from C++ only - confirm this is intended.
+extern "C"
+bool gpuIsAvailable();
+// Bond terms: read from the file named fname, or set from per-bond vectors (two atom indices, length, k).
+extern "C"
+int gpuReadBondParameters(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
+// Bond-angle terms: read from file, or set from per-angle vectors (three atom indices, angle, k).
+extern "C"
+int gpuReadBondAngleParameters(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
+        const std::vector<float>& angle, const std::vector<float>& k);
+// Dihedral terms: read from file, or set from per-dihedral vectors (four atom indices, k, phase, periodicity).
+extern "C"
+int gpuReadDihedralParameters(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
+        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
+// RB dihedral terms: read from file, or set from per-dihedral vectors (four atom indices, coefficients c0..c5).
+extern "C"
+int gpuReadRbDihedralParameters(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
+        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
+// LJ 1-4 terms: read from file, or set from per-pair vectors (two atom indices, c6, c12, q1, q2) plus scalar epsfac and fudge.
+extern "C"
+int gpuReadLJ14Parameters(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
+        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
+
+// Atom-table lookups keyed by a string. std::string / std::vector are fully
+// qualified throughout so this header compiles without relying on the
+// including file having issued `using namespace std`.
+extern "C"
+float gpuGetAtomicRadius(gpuContext gpu, std::string s);
+
+extern "C"
+unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
+
+// File-based loaders; each takes the name of the file to read.
+extern "C"
+int gpuReadAtomicParameters(gpuContext gpu, char* fname);
+
+extern "C"
+int gpuReadCoulombParameters(gpuContext gpu, char* fname);
+
+// Nonbonded parameters supplied as per-atom vectors; `exclusions` holds one
+// list of atom indices per atom.
+extern "C"
+void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
+        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions);
+
+// OBC parameters: two dielectric constants plus per-atom radius and scale vectors.
+extern "C"
+void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
+
+extern "C"
+int gpuReadShakeParameters(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
+        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);
+
+extern "C"
+int gpuAllocateInitialBuffers(gpuContext gpu);
+// Per-atom state: coordinates from a file, or positions / velocities as separate x, y, z vectors, and masses as one vector.
+extern "C"
+void gpuReadCoordinates(gpuContext gpu, char* fname);
+
+extern "C"
+void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
+
+extern "C"
+void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
+
+extern "C"
+void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
+
+extern "C"
+void gpuInitializeRandoms(gpuContext gpu);
+// Context creation returns void*; NOTE(review): presumably the result is cast to gpuContext by the caller - confirm.
+extern "C"
+void* gpuInitFromFile(char* fname);
+
+extern "C"
+void* gpuInit(int numAtoms);
+// Integrator / thermostat parameter setters; all arguments are plain floats.
+extern "C"
+void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
+
+extern "C"
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT);
+
+extern "C"
+void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
+
+extern "C"
+void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability);
+
+extern "C"
+void gpuShutDown(gpuContext gpu);
+
+extern "C"
+int gpuBuildOutputBuffers(gpuContext gpu);
+
+extern "C"
+int gpuBuildThreadBlockWorkList(gpuContext gpu);
+
+extern "C"
+int gpuBuildExclusionList(gpuContext gpu);
+
+extern "C"
+int gpuSetConstants(gpuContext gpu);
+// Debug dump helpers.
+extern "C"
+void gpuDumpCoordinates(gpuContext gpu);
+
+extern "C"
+void gpuDumpPrimeCoordinates(gpuContext gpu);
+
+extern "C"
+void gpuDumpForces(gpuContext gpu);
+
+extern "C"
+void gpuDumpAtomData(gpuContext gpu);
+
+extern "C"
+bool gpuCheckData(gpuContext gpu);
+
+extern "C"
+void gpuSetup(void* pVoid);
+
+extern "C"
+void kCPUCalculate14(gpuContext gpu);
+
+extern "C"
+void kCPUCalculateLocalForces(gpuContext gpu);
+// WriteArrayToFileN: dump a stream of N-component floats to the text file "<fname>_<step>.txt".
+extern "C"
+void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>*  psPos, int numPrint );
+
+extern "C"
+void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint );
+// NOTE(review): no WriteArrayToFile3 definition appears next to the definitions of variants 1, 2 and 4 - confirm one exists before calling it.
+extern "C"
+void WriteArrayToFile3( gpuContext gpu, char* fname, int step, CUDAStream<float3>* psPos, int numPrint );
+
+extern "C"
+void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint );
+// OBC dumps: print one line per atom to stdout.
+extern "C"
+void gpuDumpObcInfo(gpuContext gpu);
+
+extern "C"
+void gpuDumpObcLoop1(gpuContext gpu); 
+
+#endif //__GPUTYPES_H__
--- a/platforms/cuda/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda/src/kernels/kBrownianUpdate.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+#define DeltaShake  // when defined, Part1 stores only the displacement in pPosqP and Part2 adds the current position back
+
+static __constant__ cudaGmxSimulation cSim;  // file-local constant-memory copy of gpu->sim, written by SetBrownianUpdateSim
+
+// Copy the host-side simulation constants (gpu->sim) into this file's
+// __constant__ cSim; RTERROR handles a failed copy.
+void SetBrownianUpdateSim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copy this file's __constant__ cSim back into the host-side gpu->sim;
+// RTERROR handles a failed copy. The message names the Get path so a failure
+// here is not mistaken for one in SetBrownianUpdateSim.
+void GetBrownianUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+// Brownian update, first half. For every atom index this thread visits:
+//   - the current position is saved to cSim.pOldPosq,
+//   - force * cSim.GDT + random is formed for x, y and z, and
+//   - the result is written to cSim.pPosqP, either as a bare displacement
+//     (DeltaShake defined) or added onto the current position (otherwise).
+// The w component of the position is carried through unchanged. Each thread
+// starts at its global index and advances by blockDim.x * gridDim.x.
+__global__ void kBrownianUpdatePart1_kernel()
+{
+    const unsigned int randomBase = cSim.pRandomPosition[blockIdx.x];
+    __syncthreads();
+
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
+         atom < cSim.atoms;
+         atom += blockDim.x * gridDim.x)
+    {
+        const float4 noise      = cSim.pRandom4a[randomBase + atom];
+        const float4 current    = cSim.pPosq[atom];
+        const float4 force      = cSim.pForce4[atom];
+
+        cSim.pOldPosq[atom]     = current;
+
+        float4 proposed         = current;
+#ifdef DeltaShake
+        proposed.x              = force.x*cSim.GDT + noise.x;
+        proposed.y              = force.y*cSim.GDT + noise.y;
+        proposed.z              = force.z*cSim.GDT + noise.z;
+#else
+        proposed.x             += force.x*cSim.GDT + noise.x;
+        proposed.y             += force.y*cSim.GDT + noise.y;
+        proposed.z             += force.z*cSim.GDT + noise.z;
+#endif
+        cSim.pPosqP[atom]       = proposed;
+    }
+}
+
+// Host launcher for kBrownianUpdatePart1_kernel: gpu->sim.blocks blocks of
+// gpu->sim.update_threads_per_block threads, followed by LAUNCHERROR.
+void kBrownianUpdatePart1(gpuContext gpu)
+{
+//    printf("kBrownianUpdatePart1\n");
+    const dim3 grid(gpu->sim.blocks);
+    const dim3 block(gpu->sim.update_threads_per_block);
+    kBrownianUpdatePart1_kernel<<<grid, block>>>();
+    LAUNCHERROR("kBrownianUpdatePart1");
+}
+
+// Brownian update, second half. For every atom index this thread visits:
+//   - with DeltaShake defined, pPosqP holds a displacement: velocity is
+//     displacement * cSim.oneOverDeltaT and the new position is
+//     displacement + current position;
+//   - otherwise velocity is (pPosqP - pPosq) * cSim.oneOverDeltaT and pPosqP
+//     is used as the new position directly.
+// velocity.w keeps whatever pVelm4 already held; the position's w comes from
+// pPosqP. Afterwards thread 0 of each block advances that block's entry in
+// cSim.pRandomPosition by cSim.paddedNumberOfAtoms, subtracting cSim.randoms
+// once if the result exceeds it.
+__global__ void kBrownianUpdatePart2_kernel()
+{
+    const unsigned int randomBase   = cSim.pRandomPosition[blockIdx.x];
+    __syncthreads();
+
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
+         atom < cSim.atoms;
+         atom += blockDim.x * gridDim.x)
+    {
+        float4 vel                  = cSim.pVelm4[atom];
+        const float4 current        = cSim.pPosq[atom];
+        float4 updated              = cSim.pPosqP[atom];
+
+#ifdef DeltaShake
+        vel.x                       = cSim.oneOverDeltaT*(updated.x);
+        vel.y                       = cSim.oneOverDeltaT*(updated.y);
+        vel.z                       = cSim.oneOverDeltaT*(updated.z);
+
+        updated.x                  += current.x;
+        updated.y                  += current.y;
+        updated.z                  += current.z;
+#else
+        vel.x                       = cSim.oneOverDeltaT*(updated.x-current.x);
+        vel.y                       = cSim.oneOverDeltaT*(updated.y-current.y);
+        vel.z                       = cSim.oneOverDeltaT*(updated.z-current.z);
+#endif
+        cSim.pPosq[atom]            = updated;
+        cSim.pVelm4[atom]           = vel;
+    }
+
+    // Only one thread per block moves the block's random-number cursor.
+    if (threadIdx.x != 0)
+        return;
+
+    unsigned int nextBase           = randomBase;
+    nextBase                       += cSim.paddedNumberOfAtoms;
+    if (nextBase > cSim.randoms)
+        nextBase                   -= cSim.randoms;
+    cSim.pRandomPosition[blockIdx.x] = nextBase;
+}
+
+extern void kGenerateRandoms(gpuContext gpu);
+
+// Host launcher for kBrownianUpdatePart2_kernel. A function-local static
+// counts calls; every gpu->sim.randomIterations-th call also invokes
+// kGenerateRandoms and resets the counter. The counter is shared by every
+// context that calls this function.
+void kBrownianUpdatePart2(gpuContext gpu)
+{
+//    printf("kBrownianUpdatePart2\n");
+    const dim3 grid(gpu->sim.blocks);
+    const dim3 block(gpu->sim.update_threads_per_block);
+    kBrownianUpdatePart2_kernel<<<grid, block>>>();
+    LAUNCHERROR("kBrownianUpdatePart2");
+
+    // Update randoms if necessary
+    static int callsSinceRefill = 0;
+    if (++callsSinceRefill == gpu->sim.randomIterations)
+    {
+        kGenerateRandoms(gpu);
+        callsSinceRefill = 0;
+    }
+}
+
--- a/platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+++ b/platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+static __constant__ cudaGmxSimulation cSim;  // file-local constant-memory copy of gpu->sim, written by SetCalculateAndersenThermostatSim
+
+// Copy the host-side simulation constants (gpu->sim) into this file's
+// __constant__ cSim; RTERROR handles a failed copy.
+void SetCalculateAndersenThermostatSim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copy this file's __constant__ cSim back into the host-side gpu->sim;
+// RTERROR handles a failed copy. The message names the Get path so a failure
+// here is not mistaken for one in SetCalculateAndersenThermostatSim.
+void GetCalculateAndersenThermostatSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+// Andersen thermostat step. For every atom index this thread visits, the w
+// component of the atom's random float4 is compared with
+// cSim.collisionProbability:
+//   - below it, the velocity is replaced by random.xyz * sqrt(kT * velocity.w);
+//   - otherwise the velocity is left as it was.
+// velocity.w itself is never modified (presumably it holds the inverse mass -
+// confirm). The selection is done arithmetically with a 0/1 factor rather
+// than a branch. All literals and the square root are single precision so no
+// double-precision arithmetic is generated; because the factor is exactly 0
+// or 1 the results are unchanged (assuming cSim.kT is a float - confirm).
+// Afterwards thread 0 of each block advances that block's entry in
+// cSim.pRandomPosition by cSim.paddedNumberOfAtoms, subtracting cSim.randoms
+// once if the result exceeds it.
+__global__ void kCalculateAndersenThermostat_kernel()
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+    __syncthreads();
+    
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        // scale == 0: atom collides and gets a fresh velocity; scale == 1: velocity kept.
+        float scale             = (random4a.w < cSim.collisionProbability ? 0.0f : 1.0f);
+        float add               = (1.0f - scale) * sqrtf(cSim.kT * velocity.w);
+        velocity.x              = scale*velocity.x + add*random4a.x;
+        velocity.y              = scale*velocity.y + add*random4a.y;
+        velocity.z              = scale*velocity.z + add*random4a.z;
+        cSim.pVelm4[pos]        = velocity;
+         
+        pos                    += blockDim.x * gridDim.x;    
+    }
+
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+}
+
+extern void kGenerateRandoms(gpuContext gpu);
+
+// Host entry point for the Andersen thermostat: launches the kernel over
+// gpu->sim.blocks blocks of gpu->sim.update_threads_per_block threads, then
+// calls kGenerateRandoms once every gpu->sim.randomIterations invocations.
+// The invocation counter is a function-local static, so it is shared by all
+// callers of this function.
+void kCalculateAndersenThermostat(gpuContext gpu)
+{
+//    printf("kCalculateAndersenThermostat\n");
+    kCalculateAndersenThermostat_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kCalculateAndersenThermostat");
+
+    // Regenerate the random numbers once the configured number of launches
+    // has consumed them.
+    static int launchesSinceRefill = 0;
+    if (++launchesSinceRefill == gpu->sim.randomIterations)
+    {
+        kGenerateRandoms(gpu);
+        launchesSinceRefill = 0;
+    }
+}
+
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudatypes.h"
+
+#define UNROLLXX 0
+#define UNROLLXY 0
+
+// Per-thread record kept in shared memory (array sA) by the force kernel in
+// this file.  Member order is unchanged: x, y, z, q, sig, eps, fx, fy, fz,
+// eps2, sig2.
+struct Atom {
+    float x, y, z;      // position, copied from cSim.pPosq (.x/.y/.z)
+    float q;            // fourth component of cSim.pPosq (.w)
+    float sig, eps;     // the two components of cSim.pAttr (.x/.y)
+    float fx, fy, fz;   // force accumulated on this atom by the other threads of the group
+    float eps2, sig2;   // copy of the thread's own pAttr (.y/.x); read back on the exclusion paths
+};
+
+
+__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];                  // one Atom record per thread; the kernel indexes it with threadIdx.x
+__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];    // this block's slice of cSim.pWorkUnit, loaded at kernel start
+__shared__ unsigned int sNext[GRID];                                // sNext[k] = (k + 1) & (GRID - 1), filled at kernel start
+
+static __constant__ cudaGmxSimulation cSim;                         // file-local constant copy of the simulation data; written by SetCalculateCDLJForcesSim
+
+// Copies the host-side simulation description (gpu->sim) into cSim, this
+// file's __constant__ copy read by kCalculateCDLJForces_kernel.
+// The resulting status is handed to RTERROR together with a message.
+void SetCalculateCDLJForcesSim(gpuContext gpu)
+{
+    const cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copies cSim, this file's __constant__ simulation description, back into the
+// host-side gpu->sim.  The resulting status is handed to RTERROR together
+// with a message identifying this (Get) path.
+void GetCalculateCDLJForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+__global__ void kCalculateCDLJForces_kernel()
+{
+    /* Pairwise force kernel, launched by kCalculateCDLJForces when
+     * gpu->sm_version < SM_12.
+     *
+     * The block copies its slice of cSim.pWorkUnit into sWorkUnit.  Each group
+     * of GRID consecutive threads then walks that slice, handling one work
+     * unit (a pair of GRID-atom tiles starting at atoms x and y) per loop
+     * iteration.  For every atom pair (i, j) the inner loops evaluate
+     *     sig6 = ((sig_i + sig_j) / r)^6
+     *     dEdR = (eps_i * eps_j * (12 * sig6 - 6) * sig6
+     *             + cSim.epsfac * q_i * q_j / r) / r^2
+     * and subtract dEdR * (dx, dy, dz) from atom i's force; when x != y the
+     * same vector is added to atom j's force in shared memory.
+     *
+     * Results are stored (not accumulated) in cSim.pForce4a at
+     * atom index + (partner tile index) * cSim.stride; presumably a later
+     * pass sums those slots -- TODO confirm.
+     *
+     * NOTE(review): inside the loop threads read and write each other's sA
+     * entries with no barrier.  This presumably relies on each GRID-thread
+     * group executing in lockstep (GRID == warp size) -- confirm.
+     */
+    // Read queue of work blocks once so the remainder of kernel can run asynchronously
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder); // first work unit of this block
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder); // one past this block's last work unit
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1); // successor table: k -> k + 1, wrapping to 0 (assumes GRID is a power of two)
+    }
+    __syncthreads(); // sWorkUnit and sNext are complete before any thread reads them
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end = end - pos; // number of work units held in sWorkUnit
+    pos = end - (threadIdx.x >> GRIDBITS) - 1; // thread group g starts at unit (end - 1 - g) and counts down
+       
+    while (pos >= 0)
+    {  
+        /* Work unit layout as decoded below: bit 0 = exclusion flag,
+         * bits 2-16 = y tile index, bits 17 and up = x tile index
+         * (bit 1 is not read here).  Shifting a tile index left by GRIDBITS
+         * gives the index of the tile's first atom. */
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17) << GRIDBITS;
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        float dx; 
+        float dy; 
+        float dz; 
+        float r2; 
+        float invR; 
+        float sig; 
+        float sig2; 
+        float sig6; 
+        float eps; 
+        float dEdR;  
+        unsigned int tgx = threadIdx.x & (GRID - 1); // this thread's index within its GRID-thread group
+        unsigned int tbx = threadIdx.x - tgx; // index of the first thread of that group
+        int tj = tgx; // rotating index into the group's shared atoms (x != y paths)
+        Atom* psA = &sA[tbx]; // the group's GRID-entry window of sA
+        if (!bExclusionFlag)
+        {
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                /* Read fixed atom data into registers and GRF.
+                 * NOTE(review): the loop below includes j == tgx, where r2 == 0
+                 * and invR is infinite; presumably diagonal tiles always carry
+                 * the exclusion flag so this path is not taken for them --
+                 * confirm against the work unit builder. */
+                unsigned int i      = x + tgx;
+                apos                = cSim.pPosq[i];
+                float2 a            = cSim.pAttr[i];
+                sA[threadIdx.x].x   = apos.x;
+                sA[threadIdx.x].y   = apos.y;
+                sA[threadIdx.x].z   = apos.z;
+                sA[threadIdx.x].q   = apos.w;
+                sA[threadIdx.x].sig = a.x;
+                sA[threadIdx.x].eps = a.y;
+                af.x                = 0.0f;
+                af.y                = 0.0f;
+                af.z                = 0.0f;
+                apos.w             *= cSim.epsfac; // scale this atom's charge once, outside the loop
+                for (unsigned int j = 0; j < GRID; j++) // every thread visits the group's atoms in the same order 0..GRID-1
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; // 12-6 term
+                    dEdR           += apos.w * psA[j].q * invR; // plus the charge-product term
+                    dEdR           *= invR * invR; // divide by r^2 before scaling the displacement
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; // only atom i is updated here; the mirrored pair is computed by thread j itself
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride; // atom i, slot of tile x
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x; // shared memory holds the y-tile atom; the x-tile atom stays in registers
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                sA[threadIdx.x].sig2    = a.x; // stored but not read on this path (a.x / a.y are used directly)
+                sA[threadIdx.x].eps2    = a.y;
+                apos.w                 *= cSim.epsfac;
+                
+                for (j = 0; j < GRID; j++) // tj starts at tgx and advances through sNext, so each thread starts on a different shared atom
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    psA[tj].fx     += dx; // opposite-sign contribution to the y-tile atom, accumulated in shared memory
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    tj              = sNext[tj]; 
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride; // x-tile atom, slot of tile y
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx; // force gathered on the y-tile atom this thread loaded (of.w stays 0)
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride; // y-tile atom, slot of tile x
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+        else  // bExclusion
+        {
+            // Read exclusion data
+            
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx]; // per-thread bitmask: bit j set keeps the pair with shared atom j
+                unsigned int i          = x + tgx;
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = apos.x;
+                sA[threadIdx.x].y       = apos.y;
+                sA[threadIdx.x].z       = apos.z;
+                sA[threadIdx.x].q       = apos.w;
+                sA[threadIdx.x].sig     = a.x;
+                sA[threadIdx.x].eps     = a.y;
+                af.x                    = 0.0f;
+                af.y                    = 0.0f;
+                af.z                    = 0.0f;
+                sA[threadIdx.x].sig2    = a.x; // atom i's own sig/eps, read back below as psA[tgx].sig2 / psA[tgx].eps2
+                sA[threadIdx.x].eps2    = a.y;
+                apos.w                 *= cSim.epsfac;
+                
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = psA[tgx].sig2 + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = psA[tgx].eps2 * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    if (!(excl & 0x1)) // low bit clear: pair is excluded, discard whatever was computed
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz;
+                    excl          >>= 1; // move the bit for the next j into position 0
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                /* Read fixed atom data into registers and GRF.
+                 * The mask is rotated right by tgx so that bit 0 corresponds
+                 * to this thread's first tj (== tgx).
+                 * NOTE(review): for tgx == 0 the left shift count is GRID; if
+                 * GRID equals the bit width of unsigned int that shift is
+                 * undefined in C++ and only works through the device's shift
+                 * behaviour -- confirm. */
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                sA[threadIdx.x].sig2    = a.x; // x-tile atom's sig/eps, read back below as psA[tgx].sig2 / psA[tgx].eps2
+                sA[threadIdx.x].eps2    = a.y;
+                apos.w                 *= cSim.epsfac;
+                
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = psA[tgx].sig2 + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = psA[tgx].eps2 * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    if (!(excl & 0x1)) // low bit clear: pair is excluded
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = sNext[tj]; 
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+
+        pos -= cSim.nonbond_workBlock; // step to this thread group's next work unit
+    }
+}
+
+__global__ extern void kCalculateCDLJForces_12_kernel();
+
+// Host entry point for the force calculation.  Picks the kernel variant from
+// gpu->sm_version (the _12 kernel for SM_12 and above, the kernel in this
+// file otherwise) and launches it with gpu->sim.nonbond_blocks blocks of
+// gpu->sim.nonbond_threads_per_block threads.  LAUNCHERROR is invoked after
+// the launch.
+void kCalculateCDLJForces(gpuContext gpu)
+{
+//    printf("kCalculateCDLJForces\n");
+    if (gpu->sm_version >= SM_12)
+    {
+        kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    }
+    else
+    {
+        kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    }
+    LAUNCHERROR("kCalculateCDLJForces");
+}
\ No newline at end of file
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudatypes.h"
+
+#define UNROLLXX 0
+#define UNROLLXY 0
+
+// Per-thread record kept in shared memory (array sA) by the force kernel in
+// this file.  Member order is unchanged: x, y, z, q, sig, eps, fx, fy, fz.
+struct Atom {
+    float x, y, z;      // position, copied from cSim.pPosq (.x/.y/.z)
+    float q;            // fourth component of cSim.pPosq (.w)
+    float sig, eps;     // the two components of cSim.pAttr (.x/.y)
+    float fx, fy, fz;   // force accumulated on this atom by the other threads of the group
+};
+
+
+__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];                  // one Atom record per thread; the kernel indexes it with threadIdx.x
+__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];    // this block's slice of cSim.pWorkUnit, loaded at kernel start
+__shared__ unsigned int sNext[GRID];                                  // sNext[k] = (k + 1) & (GRID - 1), filled at kernel start
+
+static __constant__ cudaGmxSimulation cSim;                           // file-local constant copy of the simulation data; written by SetCalculateCDLJForces_12Sim
+
+// Copies the host-side simulation description (gpu->sim) into cSim, this
+// file's __constant__ copy read by kCalculateCDLJForces_12_kernel.
+// The resulting status is handed to RTERROR together with a message.
+void SetCalculateCDLJForces_12Sim(gpuContext gpu)
+{
+    const cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copies cSim, this file's __constant__ simulation description, back into the
+// host-side gpu->sim.  The resulting status is handed to RTERROR together
+// with a message identifying this (Get) path.
+void GetCalculateCDLJForces_12Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+__global__ void kCalculateCDLJForces_12_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously    
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
+    }
+    __syncthreads();
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end = end - pos; 
+    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+       
+    while (pos >= 0)
+    {  
+    
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17) << GRIDBITS;
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        float dx; 
+        float dy; 
+        float dz; 
+        float r2; 
+        float invR; 
+        float sig; 
+        float sig2; 
+        float sig6; 
+        float eps; 
+        float dEdR;  
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        int tj = tgx; 
+        Atom* psA = &sA[tbx];
+        if (!bExclusionFlag)
+        {
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                unsigned int i      = x + tgx;
+                apos                = cSim.pPosq[i];
+                float2 a            = cSim.pAttr[i];
+                sA[threadIdx.x].x   = apos.x;
+                sA[threadIdx.x].y   = apos.y;
+                sA[threadIdx.x].z   = apos.z;
+                sA[threadIdx.x].q   = apos.w;
+                sA[threadIdx.x].sig = a.x;
+                sA[threadIdx.x].eps = a.y;
+                af.x                = 0.0f;
+                af.y                = 0.0f;
+                af.z                = 0.0f;
+                apos.w             *= cSim.epsfac;
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                apos.w                 *= cSim.epsfac;
+                
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    tj              = sNext[tj]; 
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+        else  // bExclusion
+        {
+            // Read exclusion data
+            
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];                          
+                unsigned int i          = x + tgx;
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = apos.x;
+                sA[threadIdx.x].y       = apos.y;
+                sA[threadIdx.x].z       = apos.z;
+                sA[threadIdx.x].q       = apos.w;
+                sA[threadIdx.x].sig     = a.x;
+                sA[threadIdx.x].eps     = a.y;
+                af.x                    = 0.0f;
+                af.y                    = 0.0f;
+                af.z                    = 0.0f;
+                apos.w                 *= cSim.epsfac;
+                
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz;
+                    excl          >>= 1;               
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF        
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                apos.w                 *= cSim.epsfac;
+                
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = sNext[tj]; 
+                }
+                
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+
+        pos -= cSim.nonbond_workBlock;     
+    }
+}
+
+// Host-side launcher for kCalculateCDLJForces_12_kernel.
+// Grid and block sizes come from gpu->sim (nonbond_blocks and
+// nonbond_threads_per_block); LAUNCHERROR is invoked right after the launch.
+void kCalculateCDLJForces_12(gpuContext gpu)
+{
+    const dim3 gridDims(gpu->sim.nonbond_blocks);
+    const dim3 blockDims(gpu->sim.nonbond_threads_per_block);
+    kCalculateCDLJForces_12_kernel<<<gridDims, blockDims>>>();
+    LAUNCHERROR("kCalculateCDLJForces_12");
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudatypes.h"
+#include "cudaKernels.h"
+
+// Shared-memory record for one atom of the tile a thread group is processing.
+// Member order matches the original declaration, so the layout (13
+// consecutive floats) is unchanged.
+struct Atom {
+    float x, y, z;      // position, copied from pPosq[].x/.y/.z
+    float q;            // charge pre-multiplied by cSim.epsfac
+    float sig, eps;     // Lennard-Jones parameters, copied from pAttr[].x/.y
+    float br;           // Born radius, copied from pBornRadii[]
+    float fx, fy, fz;   // force accumulated on this atom by the tile loop
+    float fb;           // Born-force accumulator for this atom
+    float q2;           // charge pre-multiplied by cSim.preFactor
+    float junk;         // not referenced by the kernel in this file; presumably padding -- TODO confirm
+};
+
+
+// Per-block scratch: one Atom slot per thread (indexed by threadIdx.x).
+__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
+// Block-local copy of this block's slice of cSim.pWorkUnit.
+__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
+// Ring successor table: the kernel fills it with sNext[k] = (k + 1) & (GRID - 1).
+__shared__ unsigned int sNext[GRID];
+
+// File-local constant-memory copy of the simulation descriptor; written by
+// SetCalculateCDLJObcGbsaForces1Sim and read back by GetCalculateCDLJObcGbsaForces1Sim.
+static __constant__ cudaGmxSimulation cSim;
+
+// Uploads the host-side simulation descriptor (gpu->sim) into this file's
+// __constant__ copy, cSim. RTERROR handles a failed copy.
+void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Reads this file's __constant__ simulation descriptor (cSim) back into the
+// host-side copy, gpu->sim. RTERROR handles a failed copy.
+void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // The message previously said "SetSim", a copy/paste slip that pointed
+    // at the wrong function when this copy failed.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+// Combined Coulomb + Lennard-Jones + OBC GBSA (loop 1) pairwise force kernel.
+//
+// Takes no arguments: every input and output is reached through the
+// __constant__ descriptor cSim (pWorkUnit, pPosq, pAttr, pBornRadii,
+// pExclusion, pForce4a, pBornForce).
+//
+// Work decomposition, as written below:
+//   * Each block copies its slice [pos, end) of cSim.pWorkUnit into sWorkUnit.
+//     Only threads with threadIdx.x < end - pos take part in that copy, so the
+//     slice is presumably never longer than blockDim.x -- confirm against the
+//     host-side setup.
+//   * Each group of GRID consecutive threads walks the slice backwards in
+//     steps of cSim.nonbond_workBlock and processes one work unit per step.
+//   * A work unit packs: bit 0 = tile carries exclusion data,
+//     bits 2..16 = y tile index, bits 17 and up = x tile index.
+//   * Diagonal tiles (x == y) accumulate forces only on the thread's own atom;
+//     off-diagonal tiles also accumulate the reaction on the partner atoms
+//     through shared memory, rotating the partner index with sNext.
+//
+// Results are stored (not accumulated) into cSim.pForce4a / cSim.pBornForce
+// at an offset that includes the partner tile index times cSim.stride.
+//
+// NOTE(review): inside a thread group, sA[] is written and then read by the
+// other threads of the group with no barrier in between. This appears to
+// rely on the GRID threads of a group executing in lockstep -- confirm that
+// this holds on the targeted architectures.
+__global__ void kCalculateCDLJObcGbsaForces1_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x]              = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        // sNext[k] is the successor of k on a ring of GRID slots.
+        sNext[threadIdx.x]                  = (threadIdx.x + 1) & (GRID - 1);
+    }
+    __syncthreads();
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end                                     = end - pos;
+    pos                                     = end - (threadIdx.x >> GRIDBITS) - 1;
+
+    while (pos >= 0)
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                      = sWorkUnit[pos];
+        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag                 = (x & 0x1);
+        x                                   = (x >> 17) << GRIDBITS;
+        unsigned int tgx                    = threadIdx.x & (GRID - 1);   // lane within the thread group
+        unsigned int i                      = x + tgx;
+        float4 apos                         = cSim.pPosq[i];
+        float2 a                            = cSim.pAttr[i];
+        float br                            = cSim.pBornRadii[i];
+        unsigned int tbx                    = threadIdx.x - tgx;          // first thread of this group
+        int tj                              = tgx;
+        Atom* psA                           = &sA[tbx];
+        if (!bExclusionFlag)
+        {
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            {
+                // Read fixed atom data into registers and GRF
+                sA[threadIdx.x].x           = apos.x;
+                sA[threadIdx.x].y           = apos.y;
+                sA[threadIdx.x].z           = apos.z;
+                sA[threadIdx.x].q           = cSim.epsfac * apos.w;
+                sA[threadIdx.x].q2          = cSim.preFactor * apos.w;
+                sA[threadIdx.x].sig         = a.x;
+                sA[threadIdx.x].eps         = a.y;
+                sA[threadIdx.x].br          = br;
+                float4 af;
+                af.x                        = 0.0f;
+                af.y                        = 0.0f;
+                af.z                        = 0.0f;
+                af.w                        = 0.0f;
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x;
+                    float dy                = psA[j].y - apos.y;
+                    float dz                = psA[j].z - apos.z;
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[j].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    dEdR                   += apos.w * psA[j].q * invR;
+                    dEdR                   *= invR * invR;
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (apos.w * psA[j].q2) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+
+                    // Add Forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                }
+
+                // Write results
+                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+            }
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                int j                       = y + tgx;
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pAttr[j];
+                sA[threadIdx.x].br          = cSim.pBornRadii[j];
+                float4 af;
+                sA[threadIdx.x].fx          = af.x = 0.0f;
+                sA[threadIdx.x].fy          = af.y = 0.0f;
+                sA[threadIdx.x].fz          = af.z = 0.0f;
+                sA[threadIdx.x].fb          = af.w = 0.0f;
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].q           = cSim.epsfac * temp.w;
+                sA[threadIdx.x].q2          = cSim.preFactor * temp.w;
+                sA[threadIdx.x].sig         = temp1.x;
+                sA[threadIdx.x].eps         = temp1.y;
+
+                // Each lane starts at partner tj == tgx and advances around
+                // the ring via sNext, so lanes touch distinct psA slots on
+                // every iteration.
+                for (j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[tj].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    dEdR                   += apos.w * psA[tj].q * invR;
+                    dEdR                   *= invR * invR;
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (apos.w * psA[tj].q2) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+
+                    // Add forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    tj                      = sNext[tj];
+                }
+
+                // Write results
+                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+                af.x                        = sA[threadIdx.x].fx;
+                af.y                        = sA[threadIdx.x].fy;
+                af.z                        = sA[threadIdx.x].fz;
+                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
+            }
+        }
+        else  // bExclusion
+        {
+            // Read exclusion data
+
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            {
+                // Read fixed atom data into registers and GRF
+                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                float4 af;
+                af.x                        = 0.0f;
+                af.y                        = 0.0f;
+                af.z                        = 0.0f;
+                af.w                        = 0.0f;
+                sA[threadIdx.x].x           = apos.x;
+                sA[threadIdx.x].y           = apos.y;
+                sA[threadIdx.x].z           = apos.z;
+                sA[threadIdx.x].q           = cSim.epsfac * apos.w;
+                sA[threadIdx.x].q2          = cSim.preFactor * apos.w;
+                sA[threadIdx.x].sig         = a.x;
+                sA[threadIdx.x].eps         = a.y;
+                sA[threadIdx.x].br          = br;
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x;
+                    float dy                = psA[j].y - apos.y;
+                    float dz                = psA[j].z - apos.z;
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[j].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    dEdR                   += apos.w * psA[j].q * invR;
+                    dEdR                   *= invR * invR;
+                    // A clear low bit discards the Coulomb/LJ term for this
+                    // pair; the GBSA term below is still added.
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (apos.w * psA[j].q2) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+
+                    // Add Forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    excl                  >>= 1;
+                }
+
+                // Write results
+                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+            }
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                float4 af;
+                sA[threadIdx.x].fx          = af.x = 0.0f;
+                sA[threadIdx.x].fy          = af.y = 0.0f;
+                sA[threadIdx.x].fz          = af.z = 0.0f;
+                sA[threadIdx.x].fb          = af.w = 0.0f;
+                int j                       = y + tgx;
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pAttr[j];
+                sA[threadIdx.x].br          = cSim.pBornRadii[j];
+                // Rotate the mask right by tgx so that bit 0 lines up with
+                // this lane's first partner (tj == tgx). The left-shift count
+                // is masked so that lane 0 shifts by 0 instead of by GRID;
+                // shifting by the full word width is undefined in C/C++.
+                // For tgx == 0 both halves then equal excl, and for every
+                // other lane the count is unchanged.
+                excl                        = (excl >> tgx) | (excl << ((GRID - tgx) & (GRID - 1)));
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].q           = cSim.epsfac * temp.w;
+                sA[threadIdx.x].q2          = cSim.preFactor * temp.w;
+                sA[threadIdx.x].sig         = temp1.x;
+                sA[threadIdx.x].eps         = temp1.y;
+
+                for (j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[tj].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    dEdR                   += apos.w * psA[tj].q * invR;
+                    dEdR                   *= invR * invR;
+                    // A clear low bit discards the Coulomb/LJ term for this
+                    // pair; the GBSA term below is still added.
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (apos.w * psA[tj].q2) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+
+                    // Add forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    excl                  >>= 1;
+                    tj                      = sNext[tj];
+                }
+
+                // Write results
+                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                af.x                        = sA[threadIdx.x].fx;
+                af.y                        = sA[threadIdx.x].fy;
+                af.z                        = sA[threadIdx.x].fz;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
+            }
+        }
+
+        pos -= cSim.nonbond_workBlock;
+    }
+}
+
+__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();
+
+// Host-side entry point for the combined Coulomb/LJ + OBC GBSA (loop 1)
+// force pass.
+//
+// When gpu->bRecalculateBornRadii is set, the Born sum is recomputed and
+// reduced first. The kernel variant is then chosen by gpu->sm_version
+// (below SM_12 uses the base kernel, otherwise the _12 kernel) and launched
+// with the grid/block sizes stored in gpu->sim.
+void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
+{
+    //printf("In kCalculateCDLJObcGbsaForces1 QQQ\n");
+
+    // check if Born radii need to be calculated
+    if (gpu->bRecalculateBornRadii)
+    {
+        kCalculateObcGbsaBornSum(gpu);
+        kReduceObcGbsaBornSum(gpu);
+    }
+
+    if (gpu->sm_version < SM_12)
+        kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    else
+        kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+
+    // Check the launch immediately, so that a failure is attributed to this
+    // kernel and not to anything started by the debug block below.
+    LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
+
+    // Debug dump; disabled. Flip the condition to enable it.
+    if (0)
+    {
+        static int step = 0;
+        //int numPrint    = -1;
+        step++;
+        //WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
+        //gpuDumpCoordinates( gpu );
+        kReduceBornSumAndForces(gpu);
+        gpuDumpObcLoop1(gpu);
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudatypes.h"
+
+#define UNROLLXX 0
+#define UNROLLXY 0
+
+// Shared-memory record for one atom of the tile a thread group is processing.
+// Member order matches the original declaration, so the layout (11
+// consecutive floats) is unchanged.
+struct Atom {
+    float x, y, z;      // position, copied from pPosq[].x/.y/.z
+    float q;            // charge, copied from pPosq[].w
+    float sig, eps;     // Lennard-Jones parameters, copied from pAttr[].x/.y
+    float br;           // Born radius, copied from pBornRadii[]
+    float fx, fy, fz;   // presumably per-atom force accumulators -- confirm against the kernel
+    float fb;           // presumably the Born-force accumulator -- confirm against the kernel
+};
+
+
+// Per-block scratch: one Atom slot per thread (indexed by threadIdx.x).
+__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
+// Block-local copy of this block's slice of cSim.pWorkUnit.
+__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
+// Ring successor table: the kernel fills it with sNext[k] = (k + 1) & (GRID - 1).
+__shared__ unsigned int sNext[GRID];
+
+// File-local constant-memory copy of the simulation descriptor; written by
+// SetCalculateCDLJObcGbsaForces1_12Sim and read back by GetCalculateCDLJObcGbsaForces1_12Sim.
+static __constant__ cudaGmxSimulation cSim;
+
+// Uploads the host-side simulation descriptor (gpu->sim) into this file's
+// __constant__ copy, cSim. RTERROR handles a failed copy.
+void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Reads this file's __constant__ simulation descriptor (cSim) back into the
+// host-side copy, gpu->sim. RTERROR handles a failed copy.
+void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // The message previously said "SetSim", a copy/paste slip that pointed
+    // at the wrong function when this copy failed.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+__global__ void kCalculateCDLJObcGbsaForces1_12_kernel()
+{
+    /* Evaluates, one tile work unit at a time, the pairwise "CDLJ" term and the
+     * "ObcGbsaForce1" term (names taken from the section comments below) and
+     * stores per-atom partial sums in cSim.pForce4a and cSim.pBornForce.
+     * Every run of GRID consecutive threads handles one work unit per pass of
+     * the while loop.  sWorkUnit, sNext, sA, Atom, GRID and GRIDBITS are
+     * declared outside this block (the masks and shifts below presumably
+     * assume GRID == 1 << GRIDBITS -- confirm). */
+    // Read queue of work blocks once so the remainder of kernel can run asynchronously (the __syncthreads() below is the only barrier in this kernel)
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder); // first queue entry owned by this block
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    // one past the last entry; blocks with index below the remainder own one extra entry
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x]              = cSim.pWorkUnit[pos + threadIdx.x]; // one thread copies one queue entry
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x]                  = (threadIdx.x + 1) & (GRID - 1); // successor index; the mask wraps it if GRID is a power of two
+    }
+    __syncthreads();
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end                                     = end - pos; // number of entries held by this block
+    pos                                     = end - (threadIdx.x >> GRIDBITS) - 1; // group g = threadIdx.x >> GRIDBITS starts g entries before the last one
+       
+    while (pos >= 0)
+    {  
+        // Work unit layout as decoded below: bit 0 = exclusion flag, bits 2..16 = y tile index, bits 17 and up = x tile index (bit 1 is not read here)
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                      = sWorkUnit[pos];
+        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS; // index of the first atom of tile y
+        bool bExclusionFlag                 = (x & 0x1);
+        x                                   = (x >> 17) << GRIDBITS; // index of the first atom of tile x
+        unsigned int tgx                    = threadIdx.x & (GRID - 1); // this thread's position inside its group
+        unsigned int i                      = x + tgx; // atom of tile x handled by this thread
+        float4 apos                         = cSim.pPosq[i]; // xyz are used as coordinates, w is stored as q below
+        float2 a                            = cSim.pAttr[i]; // .x is used as sig, .y as eps
+        float br                            = cSim.pBornRadii[i];        
+        unsigned int tbx                    = threadIdx.x - tgx; // threadIdx.x of the first thread in this group
+        int tj                              = tgx; // starting index for the rotating walk used by the off-diagonal loops
+        Atom* psA                           = &sA[tbx]; // this group's GRID-entry slice of sA
+        if (!bExclusionFlag)
+        {
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                sA[threadIdx.x].x           = apos.x;
+                sA[threadIdx.x].y           = apos.y;
+                sA[threadIdx.x].z           = apos.z;
+                sA[threadIdx.x].q           = apos.w;
+                float q2                    = cSim.preFactor * apos.w; // charge factor used by the ObcGbsaForce1 term
+                apos.w                     *= cSim.epsfac; // charge factor used by the CDLJ term
+                sA[threadIdx.x].sig         = a.x;
+                sA[threadIdx.x].eps         = a.y;
+                sA[threadIdx.x].br          = br; 
+                float4 af; // xyz accumulate the force on atom i, w accumulates its Born-force sum
+                af.x                        = 0.0f;
+                af.y                        = 0.0f;
+                af.z                        = 0.0f;
+                af.w                        = 0.0f;
+                for (unsigned int j = 0; j < GRID; j++) // every thread visits all GRID entries, so each pair is evaluated by both of its atoms' threads and only af is updated
+                {
+                    float dx                = psA[j].x - apos.x; // NOTE(review): psA[j] was written by another thread of the group with no barrier in between; this relies on the group running in lockstep -- confirm
+                    float dy                = psA[j].y - apos.y; 
+                    float dz                = psA[j].z - apos.z; 
+                    float r2                = dx * dx + dy * dy + dz * dz; 
+                    // NOTE(review): j == tgx gives r2 == 0 and nothing masks that pair in this branch; presumably diagonal tiles always carry the exclusion flag -- confirm
+                    // CDLJ part: dEdR = (eps * (12 * sig6 - 6) * sig6 + apos.w * q_j / r) / r^2, with sig6 = (sig / r)^6
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig; 
+                    float sig2              = invR * sig; 
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2; 
+                    float eps               = a.y * psA[j].eps; 
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR                   += apos.w * psA[j].q * invR; 
+                    dEdR                   *= invR * invR; 
+
+                    // ObcGbsaForce1 part: Gpol = q2 * q_j / (r^2 + alpha2_ij * exp(-D_ij))^(3/2), with D_ij = r^2 / (4 * alpha2_ij)
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;   
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
+                    
+                    // Add Forces
+                    dx                     *= dEdR; 
+                    dy                     *= dEdR; 
+                    dz                     *= dEdR; 
+                    af.x                   -= dx; 
+                    af.y                   -= dy; 
+                    af.z                   -= dz; 
+                }
+                
+                // Write results
+                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride; // atom index plus (tile index * cSim.stride)
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                int j                       = y + tgx; // atom of tile y that this thread publishes to sA
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pAttr[j];
+                sA[threadIdx.x].br          = cSim.pBornRadii[j];
+                float4 af; // accumulators for atom i (tile x); atom j's accumulators live in sA (fx, fy, fz, fb)
+                sA[threadIdx.x].fx          = af.x = 0.0f;
+                sA[threadIdx.x].fy          = af.y = 0.0f;
+                sA[threadIdx.x].fz          = af.z = 0.0f;
+                sA[threadIdx.x].fb          = af.w = 0.0f;
+                float q2                    = apos.w * cSim.preFactor;
+                apos.w                     *= cSim.epsfac;                
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].q           = temp.w;
+                sA[threadIdx.x].sig         = temp1.x;
+                sA[threadIdx.x].eps         = temp1.y;
+                // tj starts at tgx and advances through sNext, so on any given step the threads of a group address different psA entries (assuming they step together)
+                for (j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x; 
+                    float dy                = psA[tj].y - apos.y; 
+                    float dz                = psA[tj].z - apos.z; 
+                    float r2                = dx * dx + dy * dy + dz * dz; 
+                    
+                    // CDLJ part (same expressions as in the diagonal loop above)
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig; 
+                    float sig2              = invR * sig; 
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2; 
+                    float eps               = a.y * psA[tj].eps; 
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR                   += apos.w * psA[tj].q * invR; 
+                    dEdR                   *= invR * invR; 
+                  
+                    // ObcGbsaForce1 part (same expressions as above; both atoms of the pair receive a Born-force contribution)
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;  
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;      // NOTE(review): unsynchronised read-modify-write on another thread's sA entry; relies on lockstep execution within the group -- confirm
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
+                    
+                    // Add forces
+                    dx                     *= dEdR; 
+                    dy                     *= dEdR; 
+                    dz                     *= dEdR; 
+                    af.x                   -= dx; 
+                    af.y                   -= dy; 
+                    af.z                   -= dz;    
+                    psA[tj].fx             += dx; // opposite sign for the partner atom
+                    psA[tj].fy             += dy; 
+                    psA[tj].fz             += dz;
+                    tj                      = sNext[tj]; 
+                }
+                
+                // Write results
+                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride; // atom i, slot selected by tile y
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+                af.x                        = sA[threadIdx.x].fx; // af.w is not reloaded, so the pForce4a entry written next keeps atom i's w value
+                af.y                        = sA[threadIdx.x].fy;
+                af.z                        = sA[threadIdx.x].fz;
+                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride; // atom j, slot selected by tile x
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
+            }
+        }
+        else  // bExclusion
+        {
+            // Read exclusion data
+            // One mask word per thread; in the loops below a clear low bit zeroes the CDLJ contribution of the current pair while the ObcGbsaForce1 term is still added
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                float4 af;
+                af.x                        = 0.0f;
+                af.y                        = 0.0f;
+                af.z                        = 0.0f;
+                af.w                        = 0.0f;                                      
+                sA[threadIdx.x].x           = apos.x;
+                sA[threadIdx.x].y           = apos.y;
+                sA[threadIdx.x].z           = apos.z;
+                sA[threadIdx.x].q           = apos.w;
+                float q2                    = cSim.preFactor * apos.w;
+                apos.w                     *= cSim.epsfac;
+                sA[threadIdx.x].sig         = a.x;
+                sA[threadIdx.x].eps         = a.y;
+                sA[threadIdx.x].br          = br;
+
+                // Bit 0 of excl refers to psA[j]; the word is shifted right once per iteration
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x; 
+                    float dy                = psA[j].y - apos.y; 
+                    float dz                = psA[j].z - apos.z; 
+                    float r2                = dx * dx + dy * dy + dz * dz; 
+                   
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig; 
+                    float sig2              = invR * sig; 
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2; 
+                    float eps               = a.y * psA[j].eps; 
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR                   += apos.w * psA[j].q * invR; 
+                    dEdR                   *= invR * invR;
+                    if (!(excl & 0x1)) // low bit clear: discard the CDLJ value computed above
+                    {
+                        dEdR = 0.0f;
+                    } 
+                  
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;   
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
+
+                    // Add Forces
+                    dx                     *= dEdR; 
+                    dy                     *= dEdR; 
+                    dz                     *= dEdR; 
+                    af.x                   -= dx; 
+                    af.y                   -= dy; 
+                    af.z                   -= dz; 
+                    excl                  >>= 1;               
+                }
+                
+                // Write results
+                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF        
+                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                float4 af;
+                sA[threadIdx.x].fx          = af.x = 0.0f;
+                sA[threadIdx.x].fy          = af.y = 0.0f;
+                sA[threadIdx.x].fz          = af.z = 0.0f;
+                sA[threadIdx.x].fb          = af.w = 0.0f;
+                int j                       = y + tgx;
+                float q2                    = cSim.preFactor * apos.w;
+                apos.w                     *= cSim.epsfac;
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pAttr[j];
+                sA[threadIdx.x].br          = cSim.pBornRadii[j];
+                excl                        = (excl >> tgx) | (excl << (GRID - tgx));                // rotate right by tgx so bit 0 matches the first tj (= tgx); NOTE(review): for tgx == 0 the left shift count is GRID, undefined in C++ if GRID equals the width of unsigned int -- confirm
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].q           = temp.w;
+                sA[threadIdx.x].sig         = temp1.x;
+                sA[threadIdx.x].eps         = temp1.y;
+                // Same rotating walk over psA as the non-exclusion off-diagonal loop, with the mask consumed one bit per step
+                for (j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x; 
+                    float dy                = psA[tj].y - apos.y; 
+                    float dz                = psA[tj].z - apos.z; 
+                    float r2                = dx * dx + dy * dy + dz * dz; 
+                    
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig; 
+                    float sig2              = invR * sig; 
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2; 
+                    float eps               = a.y * psA[tj].eps; 
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR                   += apos.w * psA[tj].q * invR; 
+                    dEdR                   *= invR * invR;
+                    if (!(excl & 0x1)) // low bit clear: discard the CDLJ value computed above
+                    {
+                        dEdR = 0.0f;
+                    } 
+   
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;  
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;      
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
+                   
+                    // Add forces
+                    dx                     *= dEdR; 
+                    dy                     *= dEdR; 
+                    dz                     *= dEdR; 
+                    af.x                   -= dx; 
+                    af.y                   -= dy; 
+                    af.z                   -= dz;    
+                    psA[tj].fx             += dx; 
+                    psA[tj].fy             += dy; 
+                    psA[tj].fz             += dz;
+                    excl                  >>= 1;
+                    tj                      = sNext[tj]; 
+                }
+                
+                // Write results
+                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride; // atom i, slot selected by tile y
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = af.w;
+                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride; // atom j, slot selected by tile x
+                af.x                        = sA[threadIdx.x].fx;
+                af.y                        = sA[threadIdx.x].fy;
+                af.z                        = sA[threadIdx.x].fz;
+                cSim.pForce4a[offset]       = af;
+                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
+            }
+        }
+
+        pos -= cSim.nonbond_workBlock;     // step back to this group's next queue entry; the loop ends once pos drops below zero
+    }
+}
+
+// Host-side launcher for kCalculateCDLJObcGbsaForces1_12_kernel.  Grid and
+// block sizes are taken from gpu->sim; no dynamic shared-memory size or stream
+// is supplied.  LAUNCHERROR (project macro) is invoked immediately after the
+// launch with this function's name as its tag.
+void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu)
+{
+    kCalculateCDLJObcGbsaForces1_12_kernel<<<
+        gpu->sim.nonbond_blocks,
+        gpu->sim.nonbond_threads_per_block
+    >>>();
+    LAUNCHERROR("kCalculateCDLJObcGbsaForces1_12");
+}
--- a/platforms/cuda/src/kernels/kCalculateLocalForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateLocalForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+extern __shared__ Vectors sV[]; // unsized extern __shared__ array; kCalculateLocalForces_kernel uses element threadIdx.x as per-thread scratch
+static __constant__ cudaGmxSimulation cSim; // file-local __constant__ copy of the simulation parameters; filled from gpu->sim by SetCalculateLocalForcesSim()
+
+
+// Geometry helper macros.  Vector arguments may be any expression exposing
+// .x/.y/.z members.  Several macros declare block-local temporaries named
+// dp, norm1, norm2, angle or deltaIdeal, so callers must not pass variables
+// with those names as arguments to the macros that declare them.
+
+// Dot product of v1 and v2.
+#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)
+
+// Stores in dp the dot product of v1 and v2 divided by the product of their
+// lengths, clamped to [-1, 1] so that it is a valid argument for acos().
+#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
+{ \
+    dp          = DOT3(v1, v2); \
+    float norm1 = DOT3(v1, v1); \
+    float norm2 = DOT3(v2, v2); \
+    dp /= sqrt(norm1 * norm2); \
+    dp = min(dp, 1.0f); \
+    dp = max(dp, -1.0f); \
+}
+
+// c = v1 x v2.  The three assignments are wrapped in braces (like the other
+// multi-statement macros here) so they stay together when the macro is used
+// as the body of an unbraced if/for.  c must not alias v1 or v2, because c is
+// written component by component while v1 and v2 are still being read.
+#define CROSS_PRODUCT(v1, v2, c) \
+{ \
+    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
+    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
+    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
+}
+
+// Given the cosine of an angle, stores in dEdR the value
+// param.y * (acos(cosine) - param.x * pi / 180); param.x is therefore
+// interpreted in degrees and converted to radians here.
+#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
+{ \
+   float angle          = acos(cosine); \
+   float deltaIdeal     = angle - ((param).x * (3.14159265f / 180.0f)); \
+   dEdR                 = (param).y * deltaIdeal; \
+}
+
+// Stores in angle the unsigned angle (radians, from acos) between v1 and v2.
+#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
+{ \
+    float dp; \
+    GETNORMEDDOTPRODUCT(v1, v2, dp); \
+    angle = acos(dp); \
+}
+
+// Same as GETANGLEBETWEENTWOVECTORS but also returns the clamped cosine.
+#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
+{ \
+    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
+    angle = acos(cosine); \
+}
+
+// Dihedral angle defined by three vectors: cp0 = vector1 x vector2 and
+// cp1 = vector2 x vector3 are written as outputs, angle is the angle between
+// cp0 and cp1, negated when DOT3(signVector, cp1) is negative.
+#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
+{ \
+    CROSS_PRODUCT(vector1, vector2, cp0); \
+    CROSS_PRODUCT(vector2, vector3, cp1); \
+    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
+    float dp = DOT3(signVector, cp1); \
+    angle = (dp >= 0) ? angle : -angle; \
+}
+
+// Same as GETDIHEDRALANGLEBETWEENTHREEVECTORS but also returns in cosine the
+// clamped cosine of the (unsigned) angle between cp0 and cp1.
+#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
+{ \
+    CROSS_PRODUCT(vector1, vector2, cp0); \
+    CROSS_PRODUCT(vector2, vector3, cp1); \
+    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
+    float dp = DOT3(signVector, cp1); \
+    angle = (dp >= 0) ? angle : -angle; \
+}
+
+// Uploads the host-side simulation description (gpu->sim) into this file's
+// __constant__ cSim so kCalculateLocalForces_kernel can read it.  The returned
+// status is handed to RTERROR (project macro) together with a diagnostic tag.
+void SetCalculateLocalForcesSim(gpuContext gpu)
+{
+    cudaError_t uploadStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(uploadStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Reads this file's __constant__ cSim back into the host-side structure
+// gpu->sim (the inverse of SetCalculateLocalForcesSim).  The returned status
+// is handed to RTERROR (project macro); the diagnostic text names the Get
+// path so a failure here is not mistaken for one in the Set routine.
+void GetCalculateLocalForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+    
+
+__global__ void kCalculateLocalForces_kernel()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    Vectors* A = &sV[threadIdx.x];
+
+    while (pos < cSim.bond_offset)
+    {
+        if (pos < cSim.bonds)
+        {
+            int4   atom         = cSim.pBondID[pos];
+            float4 atomA        = cSim.pPosq[atom.x];
+            float4 atomB        = cSim.pPosq[atom.y];
+            float2 bond         = cSim.pBondParameter[pos];
+            float dx            = atomB.x - atomA.x;
+            float dy            = atomB.y - atomA.y;
+            float dz            = atomB.z - atomA.z;
+            float r2            = dx * dx + dy * dy + dz * dz;
+            float r             = sqrt(r2);
+            float deltaIdeal    = r - bond.x;
+            float dEdR          = bond.y * deltaIdeal;
+            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
+//            printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
+            dx                 *= dEdR;
+            dy                 *= dEdR;
+            dz                 *= dEdR;
+            unsigned int offsetA                = atom.x + atom.z * cSim.stride;
+            unsigned int offsetB                = atom.y + atom.w * cSim.stride;
+            float4 forceA                       = {0.0f, 0.0f, 0.0f, 0.0f};
+            if (atom.z < cSim.totalNonbondOutputBuffers)
+                forceA                          = cSim.pForce4[offsetA];
+            float4 forceB                       = {0.0f, 0.0f, 0.0f, 0.0f};
+            if (atom.w < cSim.totalNonbondOutputBuffers)
+                forceB                          = cSim.pForce4[offsetB];
+            forceA.x                           += dx;
+            forceA.y                           += dy;
+            forceA.z                           += dz;
+            forceB.x                           -= dx;
+            forceB.y                           -= dy;
+            forceB.z                           -= dz;
+            cSim.pForce4[offsetA]               = forceA;
+            cSim.pForce4[offsetB]               = forceB;    
+        }
+        pos += blockDim.x * gridDim.x;
+    }
+  
+    while (pos < cSim.bond_angle_offset)
+    {
+        unsigned int pos1   = pos - cSim.bond_offset;
+        if (pos1 < cSim.bond_angles)
+        {
+            int4   atom1            = cSim.pBondAngleID1[pos1];  
+            float2 bond_angle       = cSim.pBondAngleParameter[pos1];
+            float4 a1               = cSim.pPosq[atom1.x];
+            float4 a2               = cSim.pPosq[atom1.y];
+            float4 a3               = cSim.pPosq[atom1.z];
+            A->v0.x                 = a2.x - a1.x;
+            A->v0.y                 = a2.y - a1.y;
+            A->v0.z                 = a2.z - a1.z;
+            A->v1.x                 = a2.x - a3.x;
+            A->v1.y                 = a2.y - a3.y;
+            A->v1.z                 = a2.z - a3.z;
+            float3 cp;
+            CROSS_PRODUCT(A->v0, A->v1, cp);
+            float rp                = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
+            rp                      = max(sqrt(rp), 1.0e-06f);
+            float r21               = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
+            float r23               = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
+            float dot               = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
+            float cosine            = dot / sqrt(r21 * r23);
+            float dEdR;
+            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
+            //printf("%11.4f %11.4f\n", cosine, dEdR);
+            float termA             =  dEdR / (r21 * rp);
+            float termC             = -dEdR / (r23 * rp);
+            float3 c21;
+            float3 c23;
+            CROSS_PRODUCT(A->v0, cp, c21);
+            CROSS_PRODUCT(A->v1, cp, c23);
+            c21.x                  *= termA;
+            c21.y                  *= termA;
+            c21.z                  *= termA;
+            c23.x                  *= termC;
+            c23.y                  *= termC;
+            c23.z                  *= termC;
+            int2 atom2              = cSim.pBondAngleID2[pos1];
+            unsigned int offset     = atom1.x + atom1.w * cSim.stride;
+            float4 force            = {0.0f, 0.0f, 0.0f, 0.0f}; 
+            if (atom1.w < cSim.totalNonbondOutputBuffers)
+                force               = cSim.pForce4[offset]; 
+            force.x                += c21.x;
+            force.y                += c21.y;
+            force.z                += c21.z;
+            cSim.pForce4[offset]    = force;
+            offset                  = atom1.y + atom2.x * cSim.stride;
+            force.x                 = force.y = force.z = 0.0f; 
+            if (atom2.x < cSim.totalNonbondOutputBuffers)
+                force               = cSim.pForce4[offset]; 
+            force.x                -= (c21.x + c23.x);
+            force.y                -= (c21.y + c23.y);
+            force.z                -= (c21.z + c23.z);
+            cSim.pForce4[offset]    = force;
+            offset                  = atom1.z + atom2.y * cSim.stride;
+            force.x                 = force.y = force.z = 0.0f; 
+            if (atom2.y < cSim.totalNonbondOutputBuffers)
+                force               = cSim.pForce4[offset]; 
+            force.x                += c23.x;
+            force.y                += c23.y;
+            force.z                += c23.z;
+            cSim.pForce4[offset]    = force;
+        }
+        pos += blockDim.x * gridDim.x;
+    }
+            
+    while (pos < cSim.dihedral_offset)
+    {
+        unsigned int pos1 = pos - cSim.bond_angle_offset;
+        if (pos1 < cSim.dihedrals)
+        {
+            int4   atom1        = cSim.pDihedralID1[pos1];  
+            float4 atomA        = cSim.pPosq[atom1.x];
+            float4 atomB        = cSim.pPosq[atom1.y];
+            float4 atomC        = cSim.pPosq[atom1.z];
+            float4 atomD        = cSim.pPosq[atom1.w];            
+            A->v0.x             = atomA.x - atomB.x;
+            A->v0.y             = atomA.y - atomB.y;
+            A->v0.z             = atomA.z - atomB.z;
+            A->v1.x             = atomC.x - atomB.x;
+            A->v1.y             = atomC.y - atomB.y;
+            A->v1.z             = atomC.z - atomB.z;
+            A->v2.x             = atomC.x - atomD.x;
+            A->v2.y             = atomC.y - atomD.y;
+            A->v2.z             = atomC.z - atomD.z; 
+            float3 cp0, cp1;
+            float dihedralAngle;
+            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
+            float4 dihedral         = cSim.pDihedralParameter[pos1];
+            float deltaAngle        = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
+            float sinDeltaAngle     = sin(deltaAngle);
+            float dEdAngle          = -dihedral.x * dihedral.z * sinDeltaAngle;
+            float normCross1        = DOT3(cp0, cp0);
+            float normBC            = sqrt(DOT3(A->v1, A->v1));
+            float4 ff;
+            ff.x                    = (-dEdAngle * normBC) / normCross1;
+            float normCross2        = DOT3(cp1, cp1);
+            ff.w                    = (dEdAngle * normBC) / normCross2;
+            float dp                = 1.0f / DOT3(A->v1, A->v1);
+            ff.y                    = DOT3(A->v0, A->v1) * dp;
+            ff.z                    = DOT3(A->v2, A->v1) * dp;
+            int4  atom2             = cSim.pDihedralID2[pos1];   
+            float3 internalF0;
+            float3 internalF3;
+            float3 s;
+            
+//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
+            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
+            float4 force                        = {0.0f, 0.0f, 0.0f, 0.0f}; 
+            if (atom2.x < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            internalF0.x                        = ff.x * cp0.x; 
+            force.x                            += internalF0.x;
+            internalF0.y                        = ff.x * cp0.y;
+            force.y                            += internalF0.y;
+            internalF0.z                        = ff.x * cp0.z;       
+            force.z                            += internalF0.z;
+            cSim.pForce4[offset]                = force;
+            
+            //printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.w + atom2.w * cSim.stride;
+            force.x                             = force.y = force.z = 0.0f;  
+            if (atom2.w < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            internalF3.x                        = ff.w * cp1.x;
+            force.x                            += internalF3.x;
+            internalF3.y                        = ff.w * cp1.y;
+            force.y                            += internalF3.y;
+            internalF3.z                        = ff.w * cp1.z;
+            force.z                            += internalF3.z;
+            cSim.pForce4[offset]                = force;
+            
+           // printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
+            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
+            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
+            offset                              = atom1.y + atom2.y * cSim.stride;
+            force.x                             = force.y = force.z = 0.0f; 
+            if (atom2.y < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            force.x                            += -internalF0.x + s.x;
+            force.y                            += -internalF0.y + s.y;
+            force.z                            += -internalF0.z + s.z;
+            cSim.pForce4[offset]                = force;
+            
+            //printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.z + atom2.z * cSim.stride;
+            force.x                             = force.y = force.z = 0.0f; 
+            if (atom2.z < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            force.x                            += -internalF3.x - s.x;
+            force.y                            += -internalF3.y - s.y;
+            force.z                            += -internalF3.z - s.z;
+            cSim.pForce4[offset]                = force;
+            //printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+        }        
+        pos += blockDim.x * gridDim.x;
+    }
+
+    while (pos < cSim.rb_dihedral_offset)
+    {
+        unsigned int pos1 = pos - cSim.dihedral_offset;
+        if (pos1 < cSim.rb_dihedrals)
+        {
+            int4   atom1        = cSim.pRbDihedralID1[pos1];  
+            float4 atomA        = cSim.pPosq[atom1.x];
+            float4 atomB        = cSim.pPosq[atom1.y];
+            float4 atomC        = cSim.pPosq[atom1.z];
+            float4 atomD        = cSim.pPosq[atom1.w];            
+            A->v0.x             = atomA.x - atomB.x;
+            A->v0.y             = atomA.y - atomB.y;
+            A->v0.z             = atomA.z - atomB.z;
+            A->v1.x             = atomC.x - atomB.x;
+            A->v1.y             = atomC.y - atomB.y;
+            A->v1.z             = atomC.z - atomB.z;
+            A->v2.x             = atomC.x - atomD.x;
+            A->v2.y             = atomC.y - atomD.y;
+            A->v2.z             = atomC.z - atomD.z; 
+            float3 cp0, cp1;
+            float dihedralAngle, cosPhi;
+      //      printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z); 
+      //      printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z); 
+      //      printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);  
+            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
+            if (dihedralAngle < 0.0f )
+            {
+                dihedralAngle += 3.14159265f;
+            } 
+            else 
+            {
+                dihedralAngle -= 3.14159265f;
+            }
+            cosPhi                  = -cosPhi;
+         //   printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
+            float4 dihedral1        = cSim.pRbDihedralParameter1[pos1];
+            float2 dihedral2        = cSim.pRbDihedralParameter2[pos1];
+            float cosFactor         = cosPhi;
+            float dEdAngle          = -dihedral1.y;
+        //    printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
+            dEdAngle               -= 2.0f * dihedral1.z * cosFactor;
+       //     printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 3.0f * dihedral1.w * cosFactor;
+     //       printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 4.0f * dihedral2.x * cosFactor;
+   //         printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 5.0f * dihedral2.y * cosFactor;
+ //           printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            dEdAngle               *= sin(dihedralAngle);  
+//            printf("%4d - f: %9.4f\n", pos1, dEdAngle);
+            
+            float normCross1        = DOT3(cp0, cp0);
+            float normBC            = sqrt(DOT3(A->v1, A->v1));
+            float4 ff;
+            ff.x                    = (-dEdAngle * normBC) / normCross1;
+            float normCross2        = DOT3(cp1, cp1);
+            ff.w                    = (dEdAngle * normBC) / normCross2;
+            float dp                = 1.0f / DOT3(A->v1, A->v1);
+            ff.y                    = DOT3(A->v0, A->v1) * dp;
+            ff.z                    = DOT3(A->v2, A->v1) * dp;
+            int4  atom2             = cSim.pRbDihedralID2[pos1];   
+            float3 internalF0;
+            float3 internalF3;
+            float3 s;
+            
+//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
+            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
+            float4 force                        = {0.0f, 0.0f, 0.0f, 0.0f}; 
+            if (atom2.x < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            internalF0.x                        = ff.x * cp0.x; 
+            force.x                            += internalF0.x;
+            internalF0.y                        = ff.x * cp0.y;
+            force.y                            += internalF0.y;
+            internalF0.z                        = ff.x * cp0.z;       
+            force.z                            += internalF0.z;
+            cSim.pForce4[offset]                = force;
+            
+ //           printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.w + atom2.w * cSim.stride;
+            force.x                             = force.y = force.z = 0.0f; 
+            if (atom2.w < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            internalF3.x                        = ff.w * cp1.x;
+            force.x                            += internalF3.x;
+            internalF3.y                        = ff.w * cp1.y;
+            force.y                            += internalF3.y;
+            internalF3.z                        = ff.w * cp1.z;
+            force.z                            += internalF3.z;
+            cSim.pForce4[offset]                = force;
+            
+   //         printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
+            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
+            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
+            offset                              = atom1.y + atom2.y * cSim.stride;
+            force.x                             = force.y = force.z = 0.0f; 
+            if (atom2.y < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            force.x                            += -internalF0.x + s.x;
+            force.y                            += -internalF0.y + s.y;
+            force.z                            += -internalF0.z + s.z;
+            cSim.pForce4[offset]                = force;
+     //       printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.z + atom2.z * cSim.stride;
+            force.x                             = force.y = force.z = 0.0f; 
+            if (atom2.z < cSim.totalNonbondOutputBuffers)
+                force                           = cSim.pForce4[offset]; 
+            force.x                            += -internalF3.x - s.x;
+            force.y                            += -internalF3.y - s.y;
+            force.z                            += -internalF3.z - s.z;
+            cSim.pForce4[offset]                = force;
+     //       printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+        }            
+        pos += blockDim.x * gridDim.x;
+    }   
+
+    while (pos < cSim.LJ14_offset)
+    {  
+        unsigned int pos1       = pos - cSim.rb_dihedral_offset;
+        if (pos1 < cSim.LJ14s)
+        {
+            int4 atom               = cSim.pLJ14ID[pos1];
+            float4 LJ14             = cSim.pLJ14Parameter[pos1];
+            float4 a1               = cSim.pPosq[atom.x];
+            float4 a2               = cSim.pPosq[atom.y];
+            float3 d;
+            d.x                     = a1.x - a2.x;
+            d.y                     = a1.y - a2.y;
+            d.z                     = a1.z - a2.z;
+            float r2                = DOT3(d, d);
+            float inverseR          = 1.0f / sqrt(r2);
+            float sig2              = inverseR * LJ14.y;
+            sig2                   *= sig2;
+            float sig6              = sig2 * sig2 * sig2;
+            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+            dEdR                   += LJ14.z * inverseR;
+            dEdR                   *= inverseR * inverseR;
+            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+            unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+            float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f}; 
+            if (atom.z < cSim.totalNonbondOutputBuffers)
+                forceA              = cSim.pForce4[offsetA]; 
+            float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f}; 
+            if (atom.w < cSim.totalNonbondOutputBuffers)
+                forceB              = cSim.pForce4[offsetB]; 
+            d.x                    *= dEdR;
+            d.y                    *= dEdR;
+            d.z                    *= dEdR;
+            forceA.x               += d.x;
+            forceA.y               += d.y;
+            forceA.z               += d.z;
+            forceB.x               -= d.x;
+            forceB.y               -= d.y;
+            forceB.z               -= d.z;        
+            cSim.pForce4[offsetA]   = forceA;
+            cSim.pForce4[offsetB]   = forceB;
+        }        
+        pos                    += blockDim.x * gridDim.x;
+    }
+
+}
+
+// Host-side launcher for kCalculateLocalForces_kernel.
+//
+// Launch configuration: gpu->sim.blocks blocks of
+// gpu->sim.localForces_threads_per_block threads, with one Vectors record of
+// dynamic shared memory per thread.
+void kCalculateLocalForces(gpuContext gpu)
+{
+    // Dynamic shared memory: one Vectors scratch record for every thread.
+    size_t sharedBytes = gpu->sim.localForces_threads_per_block * sizeof(Vectors);
+
+    kCalculateLocalForces_kernel<<<gpu->sim.blocks,
+                                   gpu->sim.localForces_threads_per_block,
+                                   sharedBytes>>>();
+    LAUNCHERROR("kCalculateLocalForces");
+}
+
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+#define UNROLLXX 0
+#define UNROLLXY 0
+
+// Per-thread shared-memory record used by the Born-sum kernel in this file.
+// Field order defines the memory layout and must not change.
+struct Atom {
+    float x, y, z;  // position, copied from cSim.pPosq
+    float r;        // first component of the atom's cSim.pObcData entry
+    float sr;       // second component of the atom's cSim.pObcData entry
+    float sum;      // running Born-sum contribution accumulated for this atom
+    float junk;     // never referenced in this file; presumably padding -- TODO confirm
+};
+
+__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK]; // atom scratch records; the Born-sum kernel writes slot threadIdx.x and reads its thread group's slots
+__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM]; // block-local copy of this block's slice of cSim.pWorkUnit
+__shared__ unsigned int sNext[GRID]; // sNext[t] is set to (t - 1) & (GRID - 1) by the Born-sum kernel
+
+static __constant__ cudaGmxSimulation cSim; // device copy of gpu->sim, uploaded by SetCalculateObcGbsaBornSumSim
+
+// Uploads the host-side simulation record (gpu->sim) into this file's
+// __constant__ copy, cSim. RTERROR is handed the copy status together with the
+// message to use on failure.
+void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Downloads this file's __constant__ cSim back into the host-side record
+// gpu->sim. RTERROR is handed the copy status together with the message to use
+// on failure.
+void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+// Sums each atom's partial Born sums over all nonbond output buffers and turns
+// the total into the atom's Born radius and OBC chain term.
+//
+// 1-D launch. Each thread starts at its global thread index and advances by
+// the total number of launched threads until cSim.atoms is reached.
+// Reads : cSim.pBornSum (consecutive buffers are cSim.stride floats apart),
+//         cSim.pObcData
+// Writes: cSim.pBornRadii, cSim.pObcChain
+__global__ void kReduceObcGbsaBornSum_kernel()
+{
+    for (unsigned int atomIdx = (blockIdx.x * blockDim.x + threadIdx.x);
+         atomIdx < cSim.atoms;
+         atomIdx += gridDim.x * blockDim.x)
+    {
+        float sum        = 0.0f;
+        float* pPartial  = cSim.pBornSum + atomIdx;
+        float2 atom      = cSim.pObcData[atomIdx];
+
+        // Accumulate this atom's entry from every output buffer.
+        for (int buffer = 0; buffer < cSim.nonbondOutputBuffers; buffer++)
+        {
+            sum      += *pPartial;
+            pPartial += cSim.stride;
+        }
+
+        // Scale the sum and evaluate the tanh polynomial in sum, sum^2, sum^3.
+        sum                    *= 0.5f * atom.x;
+        float sum2              = sum * sum;
+        float sum3              = sum * sum2;
+        float tanhSum           = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
+        float nonOffsetRadii    = atom.x + cSim.dielectricOffset;
+
+        // Born radius, then the chain term (derivative of the tanh polynomial
+        // scaled by 1 - tanh^2).
+        float bornRadius        = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
+        float obcChain          = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
+        obcChain                = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
+
+        cSim.pBornRadii[atomIdx] = bornRadius;
+        cSim.pObcChain[atomIdx]  = obcChain;
+    }
+}
+
+// Host launcher for kReduceObcGbsaBornSum_kernel: gpu->sim.blocks blocks of 384
+// threads. Clears gpu->bRecalculateBornRadii once the launch has been issued.
+void kReduceObcGbsaBornSum(gpuContext gpu)
+{
+    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
+    gpu->bRecalculateBornRadii = false;
+
+    // Debug dump of the OBC arrays. Permanently disabled by the constant
+    // condition; kept so it can be switched on by hand.
+    if (0)
+    {
+        static int step = 0;
+        int numPrint    = -1;
+        ++step;
+        WriteArrayToFile1(gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint);
+        WriteArrayToFile1(gpu, "ObcGbsaBornSum", step, gpu->psBornSum, numPrint);
+        WriteArrayToFile2(gpu, "ObcGbsaObcData", step, gpu->psObcData, numPrint);
+        WriteArrayToFile4(gpu, "ObcGbsaBornPos", step, gpu->psPosq4, numPrint);
+        gpuDumpObcInfo(gpu);
+    }
+
+    LAUNCHERROR("kReduceObcGbsaBornSum");
+}
+
+
+// Computes the pairwise OBC Born-sum contributions for every tile in this
+// block's share of the work-unit queue.
+//
+// Launch layout (as used below): each run of GRID consecutive threads handles
+// one work unit at a time; the run's index is threadIdx.x >> GRIDBITS and it
+// walks the block's queue backwards in steps of cSim.nonbond_workBlock.
+// Uses static shared memory only (sA, sWorkUnit, sNext).
+//
+// Work-unit encoding: bits 17 and up hold the x tile index, bits 2..16 hold
+// the y tile index; each is shifted left by GRIDBITS to get the tile's first
+// atom.
+//
+// Output: results are stored (not accumulated) into cSim.pBornSum at
+// atom + tileIndex * cSim.stride, where tileIndex is the index of the *other*
+// tile of the pair.
+//
+// NOTE(review): sA is exchanged between the threads of a run inside the
+// divergent work loop without a barrier; this presumably relies on a run of
+// GRID threads being one warp executing in lockstep -- confirm before porting
+// to hardware with independent thread scheduling.
+__global__ void kCalculateObcGbsaBornSum_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously
+    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
+    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        // Successor table: lane t is followed by lane (t - 1), masked to GRID - 1.
+        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
+    }
+    __syncthreads();
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end = end - pos;
+    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+
+    while (pos >= 0)
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x = (x >> 17) << GRIDBITS;
+        float       dx;
+        float       dy;
+        float       dz;
+        float       r2;
+        float       r;
+
+        unsigned int tgx = threadIdx.x & (GRID - 1);    // lane within the run of GRID threads
+        unsigned int tbx = threadIdx.x - tgx;           // first thread of the run
+        int tj = tgx;
+        Atom* psA = &sA[tbx];                           // this run's slice of shared atoms
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            unsigned int i = x + tgx;
+            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
+            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+            sA[threadIdx.x].r           = ar.x;
+            sA[threadIdx.x].sr          = ar.y;
+            apos.w                      = 0.0f;
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                dx                      = psA[j].x - apos.x;
+                dy                      = psA[j].y - apos.y;
+                dz                      = psA[j].z - apos.z;
+                r2                      = dx * dx + dy * dy + dz * dz;
+                r                       = sqrt(r2);
+                float rInverse          = 1.0f / r;
+                float rScaledRadiusJ    = r + psA[j].sr;
+                // Skip the self pair (r == 0) and pairs where atom j's scaled
+                // sphere does not reach past atom i's radius.
+                if ((j != tgx) && (ar.x < rScaledRadiusJ))
+                {
+                    float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
+                    float u_ij     = 1.0f / rScaledRadiusJ;
+                    float l_ij2    = l_ij * l_ij;
+                    float u_ij2    = u_ij * u_ij;
+                    float ratio    = log(u_ij / l_ij);
+                    apos.w        += l_ij -
+                                     u_ij +
+                                     0.25f * r * (u_ij2 - l_ij2) +
+                                     (0.50f * rInverse * ratio) +
+                                     (0.25f * psA[j].sr * psA[j].sr * rInverse) *
+                                     (l_ij2 - u_ij2);
+
+                    // Extra term when atom i lies inside atom j's scaled
+                    // sphere. The comparison uses the scaled radius (sr), the
+                    // same quantity used for l_ij/u_ij above and by both
+                    // off-diagonal branches below.
+                    if (ar.x < (psA[j].sr - r))
+                    {
+                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
+                    }
+                }
+            }
+
+            // Write results
+            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            int j                           = y + tgx;
+            unsigned int i                  = x + tgx;
+
+            float4 temp                     = cSim.pPosq[j];
+            float2 temp1                    = cSim.pObcData[j];
+            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
+            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
+            sA[threadIdx.x].x               = temp.x;
+            sA[threadIdx.x].y               = temp.y;
+            sA[threadIdx.x].z               = temp.z;
+            sA[threadIdx.x].r               = temp1.x;
+            sA[threadIdx.x].sr              = temp1.y;
+            sA[threadIdx.x].sum = apos.w    = 0.0f;
+
+            // GRID steps; tj follows the sNext table so that every lane is
+            // working on a different shared atom at each step.
+            for (unsigned int k = 0; k < GRID; k++)
+            {
+                dx                      = psA[tj].x - apos.x;
+                dy                      = psA[tj].y - apos.y;
+                dz                      = psA[tj].z - apos.z;
+                r2                      = dx * dx + dy * dy + dz * dz;
+                r                       = sqrt(r2);
+                float rInverse          = 1.0f / r;
+
+                // Contribution of the shared (y tile) atom to the local (x tile) atom.
+                float rScaledRadiusJ    = r + psA[tj].sr;
+                if (ar.x < rScaledRadiusJ)
+                {
+                    float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
+                    float u_ij     = 1.0f / rScaledRadiusJ;
+                    float l_ij2    = l_ij * l_ij;
+                    float u_ij2    = u_ij * u_ij;
+                    float ratio    = log(u_ij / l_ij);
+                    float term     = l_ij -
+                                     u_ij +
+                                     0.25f * r * (u_ij2 - l_ij2) +
+                                     (0.50f * rInverse * ratio) +
+                                     (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
+                                     (l_ij2 - u_ij2);
+                    if (ar.x < (psA[tj].sr - r))
+                    {
+                        term += 2.0f * ((1.0f / ar.x) - l_ij);
+                    }
+                    apos.w        += term;
+                }
+
+                // Contribution of the local (x tile) atom to the shared (y tile) atom.
+                float rScaledRadiusI    = r + ar.y;
+                if (psA[tj].r < rScaledRadiusI)
+                {
+                    float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
+                    float u_ij     = 1.0f / rScaledRadiusI;
+                    float l_ij2    = l_ij * l_ij;
+                    float u_ij2    = u_ij * u_ij;
+                    float ratio    = log(u_ij / l_ij);
+                    float term     = l_ij -
+                                     u_ij +
+                                     0.25f * r * (u_ij2 - l_ij2) +
+                                     (0.50f * rInverse * ratio) +
+                                     (0.25f * ar.y * ar.y * rInverse) *
+                                     (l_ij2 - u_ij2);
+
+                    if (psA[tj].r < (ar.y - r))
+                    {
+                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
+                    }
+                    psA[tj].sum    += term;
+                }
+                tj = sNext[tj];
+            }
+
+            // Write results
+            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
+        }
+
+        pos -= cSim.nonbond_workBlock;
+    }
+}
+
+// Host launcher for kCalculateObcGbsaBornSum_kernel: gpu->sim.nonbond_blocks
+// blocks of gpu->sim.nonbond_threads_per_block threads, no dynamic shared
+// memory.
+void kCalculateObcGbsaBornSum(gpuContext gpu)
+{
+  //  printf("kCalculateObcgbsaBornSum\n");
+    kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    // Label the launch check with this launcher's own name, as the other
+    // launchers do, so a reported failure points at the right function.
+    LAUNCHERROR("kCalculateObcGbsaBornSum");
+}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+// Shared-memory atom record for this file's kernels. Field order defines the
+// memory layout and must not change.
+// NOTE(review): the code that fills these fields is outside this view; the
+// names suggest position, charge, Born radius and accumulated force /
+// Born-force components -- confirm against the kernel.
+struct Atom {
+    float x, y, z;
+    float q;
+    float br;
+    float fx, fy, fz;
+    float fb;
+};
+
+__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK]; // atom scratch records; NOTE(review): the consumer is outside this view -- presumably one slot per thread
+__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM]; // block-local copy of this block's slice of cSim.pWorkUnit
+__shared__ unsigned int sNext[GRID]; // NOTE(review): initialisation is outside this view -- presumably a lane successor table
+
+static __constant__ cudaGmxSimulation cSim; // device copy of gpu->sim, uploaded by SetCalculateObcGbsaForces1Sim
+
+// Uploads the host-side simulation record (gpu->sim) into this file's
+// __constant__ copy, cSim. RTERROR is handed the copy status together with the
+// message to use on failure.
+void SetCalculateObcGbsaForces1Sim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Downloads this file's __constant__ cSim back into the host-side record
+// gpu->sim. RTERROR is handed the copy status together with the message to use
+// on failure.
+void GetCalculateObcGbsaForces1Sim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+// Reduces each atom's Born-force entries over all nonbond output buffers, adds
+// the surface-area term, scales by bornRadius^2 * obcChain and stores the
+// result in the atom's first cSim.pBornForce slot.
+//
+// 1-D launch. Each thread starts at its global thread index and advances by
+// the total number of launched threads until cSim.atoms is reached.
+// Reads : cSim.pBornRadii, cSim.pObcChain, cSim.pObcData, cSim.pBornForce
+// Writes: cSim.pBornForce[atom]
+__global__ void kReduceObcGbsaBornForces_kernel()
+{
+    for (unsigned int atomIdx = (blockIdx.x * blockDim.x + threadIdx.x);
+         atomIdx < cSim.atoms;
+         atomIdx += gridDim.x * blockDim.x)
+    {
+        float bornRadius = cSim.pBornRadii[atomIdx];
+        float obcChain   = cSim.pObcChain[atomIdx];
+        float2 obcData   = cSim.pObcData[atomIdx];
+        float totalForce = 0.0f;
+        float* pBuffer   = cSim.pBornForce + atomIdx;
+
+        // Sum the per-buffer entries four at a time, then two, then one.
+        int remaining = cSim.nonbondOutputBuffers;
+        for (; remaining >= 4; remaining -= 4)
+        {
+            float f1    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f2    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f3    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f4    = *pBuffer;
+            pBuffer    += cSim.stride;
+            totalForce += f1 + f2 + f3 + f4;
+        }
+        if (remaining >= 2)
+        {
+            float f1    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f2    = *pBuffer;
+            pBuffer    += cSim.stride;
+            totalForce += f1 + f2;
+            remaining  -= 2;
+        }
+        if (remaining > 0)
+        {
+            totalForce += *pBuffer;
+        }
+
+        // Surface-area term: surfaceAreaFactor * r^2 * (radius / bornRadius)^6,
+        // divided by the Born radius before it is added.
+        float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
+        float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
+        float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
+        totalForce        += saTerm / bornRadius;
+
+        totalForce *= bornRadius * bornRadius * obcChain;
+
+        // The reduced value overwrites the atom's entry in the first buffer.
+        cSim.pBornForce[atomIdx] = totalForce;
+    }
+}
+
+// Variant of the Born-force reduction that omits the surface-area term: sums
+// each atom's Born-force entries over all nonbond output buffers, scales by
+// bornRadius^2 * obcChain and stores the result in the atom's first
+// cSim.pBornForce slot.
+//
+// 1-D launch. Each thread starts at its global thread index and advances by
+// the total number of launched threads until cSim.atoms is reached.
+__global__ void kReduceObcGbsaBornForces1_kernel()
+{
+    for (unsigned int atomIdx = (blockIdx.x * blockDim.x + threadIdx.x);
+         atomIdx < cSim.atoms;
+         atomIdx += gridDim.x * blockDim.x)
+    {
+        float bornRadius = cSim.pBornRadii[atomIdx];
+        float obcChain   = cSim.pObcChain[atomIdx];
+        float totalForce = 0.0f;
+        float* pBuffer   = cSim.pBornForce + atomIdx;
+
+        // Sum the per-buffer entries four at a time, then two, then one.
+        int remaining = cSim.nonbondOutputBuffers;
+        for (; remaining >= 4; remaining -= 4)
+        {
+            float f1    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f2    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f3    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f4    = *pBuffer;
+            pBuffer    += cSim.stride;
+            totalForce += f1 + f2 + f3 + f4;
+        }
+        if (remaining >= 2)
+        {
+            float f1    = *pBuffer;
+            pBuffer    += cSim.stride;
+            float f2    = *pBuffer;
+            pBuffer    += cSim.stride;
+            totalForce += f1 + f2;
+            remaining  -= 2;
+        }
+        if (remaining > 0)
+        {
+            totalForce += *pBuffer;
+        }
+
+        totalForce *= bornRadius * bornRadius * obcChain;
+
+        cSim.pBornForce[atomIdx] = totalForce;
+    }
+}
+
+
+// Adds the surface-area term to each atom's already-stored Born force and
+// scales the result by bornRadius^2 * obcChain, in place in cSim.pBornForce.
+//
+// 1-D launch. Each thread starts at its global thread index and advances by
+// the total number of launched threads until cSim.atoms is reached.
+// Reads : cSim.pBornRadii, cSim.pObcChain, cSim.pObcData, cSim.pBornForce
+// Writes: cSim.pBornForce[atom]
+__global__ void kAceGbsa_kernel()
+{
+    for (unsigned int atomIdx = (blockIdx.x * blockDim.x + threadIdx.x);
+         atomIdx < cSim.atoms;
+         atomIdx += gridDim.x * blockDim.x)
+    {
+        float bornRadius = cSim.pBornRadii[atomIdx];
+        float obcChain   = cSim.pObcChain[atomIdx];
+        float2 obcData   = cSim.pObcData[atomIdx];
+        float totalForce = cSim.pBornForce[atomIdx];
+
+        // Surface-area term: surfaceAreaFactor * r^2 * (radius / bornRadius)^6,
+        // divided by the Born radius before it is added.
+        float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
+        float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
+        float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
+        totalForce        += saTerm / bornRadius;
+        totalForce        *= bornRadius * bornRadius * obcChain;
+
+        cSim.pBornForce[atomIdx] = totalForce;
+    }
+}
+
+// Host-side launcher for kReduceObcGbsaBornForces_kernel.
+//
+// Launch shape: gpu->sim.blocks blocks of gpu->sim.bf_reduce_threads_per_block
+// threads. The launch is checked with LAUNCHERROR, as the other launcher in
+// this file does, so a bad launch configuration is reported here instead of
+// surfacing at some later, unrelated CUDA call.
+//
+// Earlier revisions of this function had kReduceObcGbsaBornForces1_kernel and
+// kAceGbsa_kernel launches (same launch shape) and a gpuDumpObcLoop1(gpu)
+// debug dump commented out at this point.
+void kReduceObcGbsaBornForces(gpuContext gpu)
+{
+    kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
+    LAUNCHERROR("kReduceObcGbsaBornForces");
+}
+
+/* kCalculateObcGbsaForces1_kernel
+ *
+ * Walks the work-unit queue cSim.pWorkUnit; each work unit names a pair of
+ * atom tiles (x, y) of GRID atoms each. For every atom pair in the tile pair
+ * the kernel accumulates a force (x, y, z) and a Born-force term (w) and
+ * writes the totals to cSim.pForce4a and cSim.pBornForce.
+ *
+ * Work distribution, as coded below:
+ *   - Block b copies work units
+ *       [nbWorkUnitsPerBlock * b       + min(b,     remainder),
+ *        nbWorkUnitsPerBlock * (b + 1) + min(b + 1, remainder))
+ *     into the shared array sWorkUnit.
+ *   - Threads are treated in groups of GRID consecutive threads: tgx is the
+ *     lane inside the group and tbx the group's first thread. The group
+ *     number is threadIdx.x >> GRIDBITS (this matches the tgx mask only if
+ *     GRID == 1 << GRIDBITS - TODO confirm against the definitions).
+ *   - A group starts at local work unit (count - 1 - group) and steps back
+ *     by cSim.nonbond_workBlock until the index goes negative.
+ *
+ * Output indexing: results for the atoms of tile x computed against tile y
+ * are stored at x + tgx + (y >> GRIDBITS) * cSim.stride, so the partner
+ * tile's index selects which stride-sized output buffer is written.
+ *
+ * NOTE(review): inside the loop, threads read (and, off the diagonal, update)
+ * sA slots that were written by other threads of the same group, and no
+ * barrier separates those accesses. This presumably relies on each group
+ * being one warp that executes in lockstep - confirm before changing GRID or
+ * running on hardware without that guarantee.
+ */
+__global__ void kCalculateObcGbsaForces1_kernel()
+{
+    // Copy this block's slice [pos, end) of the work-unit queue into shared
+    // memory once; the rest of the kernel contains no further block-wide barrier.
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1); // successor table: equals (i + 1) mod GRID when GRID is a power of two
+    }
+    __syncthreads(); // makes sWorkUnit and sNext visible to every thread in the block
+
+    // Switch pos and end to indices local to the slice just read into shared
+    // memory: end becomes the slice length, pos this thread's first local work unit.
+    end = end - pos; 
+    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+       
+    while (pos >= 0)
+    {  
+    
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS; // bits 2..16 hold the y tile index; shift converts it to the tile's first atom index
+        x = (x >> 17) << GRIDBITS; // bits 17 and up hold the x tile index (bits 0..1 are not used in this kernel)
+        float4      apos;   // Local atom x, y, z, q
+        float4      af;     // Local atom fx, fy, fz, fb
+        unsigned int tgx = threadIdx.x & (GRID - 1); // lane within the GRID-sized thread group
+        unsigned int tbx = threadIdx.x - tgx; // index of the group's first thread
+        int tj = tgx; 
+        Atom* psA = &sA[tbx]; // this group's window of GRID shared-memory slots
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        { 
+            // Read this thread's atom into registers and publish a copy in shared memory
+            unsigned int i          = x + tgx;
+            apos                    = cSim.pPosq[i];
+            float br                = cSim.pBornRadii[i];
+            sA[threadIdx.x].x       = apos.x;
+            sA[threadIdx.x].y       = apos.y;
+            sA[threadIdx.x].z       = apos.z;
+            sA[threadIdx.x].q       = apos.w;
+            sA[threadIdx.x].br      = br;
+            af.x                    = 0.0f;
+            af.y                    = 0.0f;
+            af.z                    = 0.0f;
+            af.w                    = 0.0f;
+            apos.w                 *= cSim.preFactor; // scales only the register copy; sA[].q keeps the unscaled value
+            
+            for (unsigned int j = 0; j < GRID; j++) // visits every slot of the group, including j == tgx where dx = dy = dz = 0
+            {
+                float dx                = psA[j].x - apos.x; 
+                float dy                = psA[j].y - apos.y; 
+                float dz                = psA[j].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz; 
+                float alpha2_ij         = br * psA[j].br; 
+                float D_ij              = r2 / (4.0f * alpha2_ij); 
+                float expTerm           = exp(-D_ij); 
+                float denominator2      = r2 + alpha2_ij * expTerm; 
+                float denominator       = sqrt(denominator2); 
+                float Gpol              = (apos.w * psA[j].q) / (denominator * denominator2); 
+                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
+                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                dx                     *= dGpol_dr; 
+                dy                     *= dGpol_dr; 
+                dz                     *= dGpol_dr; 
+                af.x                   -= dx; 
+                af.y                   -= dy; 
+                af.z                   -= dz; 
+                af.w                   += dGpol_dalpha2_ij * psA[j].br;      
+            }
+            
+            // Write results: only the register accumulators are stored; nothing is accumulated in shared memory on this path
+            int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = af;
+            cSim.pBornForce[offset]             = af.w;
+        }         
+        else        // 100% utilization
+        {
+            // Read atom i (tile x) into registers and atom j (tile y) into this thread's shared-memory slot
+            int j                   = y + tgx;
+            unsigned int i          = x + tgx;
+            float4 temp             = cSim.pPosq[j];
+            float temp1             = cSim.pBornRadii[j];
+            apos                    = cSim.pPosq[i];
+            float br                = cSim.pBornRadii[i];
+            sA[threadIdx.x].x       = temp.x;
+            sA[threadIdx.x].y       = temp.y;
+            sA[threadIdx.x].z       = temp.z;
+            sA[threadIdx.x].q       = temp.w;
+            sA[threadIdx.x].br      = temp1;
+            sA[threadIdx.x].fx      = af.x = 0.0f;
+            sA[threadIdx.x].fy      = af.y = 0.0f;
+            sA[threadIdx.x].fz      = af.z = 0.0f;
+            sA[threadIdx.x].fb      = af.w = 0.0f;
+            apos.w                 *= cSim.preFactor; // scales only the register copy; sA[].q keeps the unscaled value
+
+            for (j = 0; j < GRID; j++) // j only counts iterations; tj selects the slot, starting at tgx and following sNext
+            {   
+                float dx                = psA[tj].x - apos.x; 
+                float dy                = psA[tj].y - apos.y; 
+                float dz                = psA[tj].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz; 
+                float alpha2_ij         = br * psA[tj].br; 
+                float D_ij              = r2 / (4.0f * alpha2_ij); 
+                float expTerm           = exp(-D_ij); 
+                float denominator2      = r2 + alpha2_ij * expTerm; 
+                float denominator       = sqrt(denominator2); 
+                float Gpol              = (apos.w * psA[tj].q) / (denominator * denominator2); 
+                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
+                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                dx                     *= dGpol_dr; 
+                dy                     *= dGpol_dr; 
+                dz                     *= dGpol_dr; 
+                af.x                   -= dx; 
+                af.y                   -= dy; 
+                af.z                   -= dz; 
+                psA[tj].fx             += dx; 
+                psA[tj].fy             += dy; 
+                psA[tj].fz             += dz; 
+                af.w                   += dGpol_dalpha2_ij * psA[tj].br; 
+                psA[tj].fb             += dGpol_dalpha2_ij * br;        
+                tj                      = sNext[tj]; 
+            }
+           
+            // Write results: register totals belong to the tile-x atom, shared-memory totals to the tile-y atom
+            int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = af;
+            cSim.pBornForce[offset]             = af.w;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            af.x                                = sA[threadIdx.x].fx;
+            af.y                                = sA[threadIdx.x].fy;
+            af.z                                = sA[threadIdx.x].fz;
+            af.w                                = sA[threadIdx.x].fb;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride; // NOTE(review): repeats the assignment four lines up with the same value; redundant
+            cSim.pForce4a[offset]               = af;
+            cSim.pBornForce[offset]             = af.w;
+        }
+        pos -= cSim.nonbond_workBlock;     // advance to this group's next work unit
+    }
+}
+
+// Kernel variant defined in kCalculateObcGbsaForces1_12.cu.
+__global__ extern void kCalculateObcGbsaForces1_12_kernel();
+
+// Host-side launcher for the first OBC GBSA force loop.
+//
+// Chooses the kernel by device version: kCalculateObcGbsaForces1_kernel when
+// gpu->sm_version is below SM_12, kCalculateObcGbsaForces1_12_kernel
+// otherwise. Both are launched with gpu->sim.nonbond_blocks blocks of
+// gpu->sim.nonbond_threads_per_block threads, and the launch is then checked
+// with LAUNCHERROR.
+void kCalculateObcGbsaForces1(gpuContext gpu)
+{
+    const bool useSm12Kernel = !(gpu->sm_version < SM_12);
+
+    if (useSm12Kernel)
+    {
+        kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    }
+    else
+    {
+        kCalculateObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    }
+    LAUNCHERROR("kCalculateObcGbsaForce1");
+}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+// Per-thread shared-memory record used by kCalculateObcGbsaForces1_12_kernel.
+// Member order is x, y, z, q, br, fx, fy, fz, fb (nine floats).
+struct Atom {
+    float x, y, z;      // coordinates, copied from cSim.pPosq[...].x/.y/.z
+    float q;            // copied from cSim.pPosq[...].w
+    float br;           // copied from cSim.pBornRadii[...]
+    float fx, fy, fz;   // force accumulated on this slot's atom by the other threads of the group
+    float fb;           // Born-force term accumulated on this slot's atom
+};
+
+__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK]; // one Atom slot per thread; the kernel indexes it with threadIdx.x
+__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM]; // this block's slice of cSim.pWorkUnit, filled at kernel start
+__shared__ unsigned int sNext[GRID]; // successor table: sNext[i] = (i + 1) & (GRID - 1)
+
+static __constant__ cudaGmxSimulation cSim; // file-local constant-memory copy of gpu->sim, written by SetCalculateObcGbsaForces1_12Sim
+
+// Copies the host-side simulation descriptor gpu->sim into this file's
+// constant-memory symbol cSim. The cudaMemcpyToSymbol status is handed to
+// RTERROR together with the failure message.
+void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
+{
+    cudaError_t uploadStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(uploadStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copies this file's constant-memory symbol cSim back into the host-side
+// simulation descriptor gpu->sim. The cudaMemcpyFromSymbol status is handed
+// to RTERROR together with the failure message.
+//
+// The message now says "GetSim": it previously said "SetSim" (copied from the
+// Set function), which pointed at the wrong call when this copy failed.
+void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+/* kCalculateObcGbsaForces1_12_kernel
+ *
+ * Variant of the first OBC GBSA force loop that the host launchers select
+ * for devices whose sm_version is not below SM_12. It walks the work-unit
+ * queue cSim.pWorkUnit; each work unit names a pair of atom tiles (x, y) of
+ * GRID atoms each. For every atom pair in the tile pair the kernel
+ * accumulates a force (x, y, z) and a Born-force term (w) and writes the
+ * totals to cSim.pForce4a and cSim.pBornForce.
+ *
+ * Work distribution, as coded below:
+ *   - Block b copies work units
+ *       [nbWorkUnitsPerBlock * b       + min(b,     remainder),
+ *        nbWorkUnitsPerBlock * (b + 1) + min(b + 1, remainder))
+ *     into sWorkUnit (GT2XX_NONBOND_WORKUNITS_PER_SM entries).
+ *   - Threads are treated in groups of GRID consecutive threads: tgx is the
+ *     lane inside the group and tbx the group's first thread. The group
+ *     number is threadIdx.x >> GRIDBITS (this matches the tgx mask only if
+ *     GRID == 1 << GRIDBITS - TODO confirm against the definitions).
+ *   - A group starts at local work unit (count - 1 - group) and steps back
+ *     by cSim.nonbond_workBlock until the index goes negative.
+ *   - sA holds one Atom per thread (GT2XX_NONBOND_THREADS_PER_BLOCK entries),
+ *     so the block size must not exceed that constant.
+ *
+ * Output indexing: results for the atoms of tile x computed against tile y
+ * are stored at x + tgx + (y >> GRIDBITS) * cSim.stride, so the partner
+ * tile's index selects which stride-sized output buffer is written.
+ *
+ * NOTE(review): inside the loop, threads read (and, off the diagonal, update)
+ * sA slots that were written by other threads of the same group, and no
+ * barrier separates those accesses. This presumably relies on each group
+ * being one warp that executes in lockstep - confirm before changing GRID or
+ * running on hardware without that guarantee.
+ */
+__global__ void kCalculateObcGbsaForces1_12_kernel()
+{
+    // Copy this block's slice [pos, end) of the work-unit queue into shared
+    // memory once; the rest of the kernel contains no further block-wide barrier.
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1); // successor table: equals (i + 1) mod GRID when GRID is a power of two
+    }
+    __syncthreads(); // makes sWorkUnit and sNext visible to every thread in the block
+
+    // Switch pos and end to indices local to the slice just read into shared
+    // memory: end becomes the slice length, pos this thread's first local work unit.
+    end = end - pos; 
+    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+       
+    while (pos >= 0)
+    {  
+    
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS; // bits 2..16 hold the y tile index; shift converts it to the tile's first atom index
+        x = (x >> 17) << GRIDBITS; // bits 17 and up hold the x tile index (bits 0..1 are not used in this kernel)
+        float4      apos;   // Local atom x, y, z, q
+        float4      af;     // Local atom fx, fy, fz, fb
+        unsigned int tgx = threadIdx.x & (GRID - 1); // lane within the GRID-sized thread group
+        unsigned int tbx = threadIdx.x - tgx; // index of the group's first thread
+        int tj = tgx; 
+        Atom* psA = &sA[tbx]; // this group's window of GRID shared-memory slots
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        { 
+            // Read this thread's atom into registers and publish a copy in shared memory
+            unsigned int i          = x + tgx;
+            apos                    = cSim.pPosq[i];
+            float br                = cSim.pBornRadii[i];
+            sA[threadIdx.x].x       = apos.x;
+            sA[threadIdx.x].y       = apos.y;
+            sA[threadIdx.x].z       = apos.z;
+            sA[threadIdx.x].q       = apos.w;
+            sA[threadIdx.x].br      = br;
+            af.x                    = 0.0f;
+            af.y                    = 0.0f;
+            af.z                    = 0.0f;
+            af.w                    = 0.0f;
+            apos.w                 *= cSim.preFactor; // scales only the register copy; sA[].q keeps the unscaled value
+            
+            for (unsigned int j = 0; j < GRID; j++) // visits every slot of the group, including j == tgx where dx = dy = dz = 0
+            {
+                float dx                = psA[j].x - apos.x; 
+                float dy                = psA[j].y - apos.y; 
+                float dz                = psA[j].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz; 
+                float alpha2_ij         = br * psA[j].br; 
+                float D_ij              = r2 / (4.0f * alpha2_ij); 
+                float expTerm           = exp(-D_ij); 
+                float denominator2      = r2 + alpha2_ij * expTerm; 
+                float denominator       = sqrt(denominator2); 
+                float Gpol              = (apos.w * psA[j].q) / (denominator * denominator2); 
+                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
+                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                dx                     *= dGpol_dr; 
+                dy                     *= dGpol_dr; 
+                dz                     *= dGpol_dr; 
+                af.x                   -= dx; 
+                af.y                   -= dy; 
+                af.z                   -= dz; 
+                af.w                   += dGpol_dalpha2_ij * psA[j].br;      
+            }
+            
+            // Write results: only the register accumulators are stored; nothing is accumulated in shared memory on this path
+            int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = af;
+            cSim.pBornForce[offset]             = af.w;
+        }         
+        else        // 100% utilization
+        {
+            // Read atom i (tile x) into registers and atom j (tile y) into this thread's shared-memory slot
+            int j                   = y + tgx;
+            unsigned int i          = x + tgx;
+            float4 temp             = cSim.pPosq[j];
+            float temp1             = cSim.pBornRadii[j];
+            apos                    = cSim.pPosq[i];
+            float br                = cSim.pBornRadii[i];
+            sA[threadIdx.x].x       = temp.x;
+            sA[threadIdx.x].y       = temp.y;
+            sA[threadIdx.x].z       = temp.z;
+            sA[threadIdx.x].q       = temp.w;
+            sA[threadIdx.x].br      = temp1;
+            sA[threadIdx.x].fx      = af.x = 0.0f;
+            sA[threadIdx.x].fy      = af.y = 0.0f;
+            sA[threadIdx.x].fz      = af.z = 0.0f;
+            sA[threadIdx.x].fb      = af.w = 0.0f;
+            apos.w                 *= cSim.preFactor; // scales only the register copy; sA[].q keeps the unscaled value
+
+            for (j = 0; j < GRID; j++) // j only counts iterations; tj selects the slot, starting at tgx and following sNext
+            {   
+                float dx                = psA[tj].x - apos.x; 
+                float dy                = psA[tj].y - apos.y; 
+                float dz                = psA[tj].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz; 
+                float alpha2_ij         = br * psA[tj].br; 
+                float D_ij              = r2 / (4.0f * alpha2_ij); 
+                float expTerm           = exp(-D_ij); 
+                float denominator2      = r2 + alpha2_ij * expTerm; 
+                float denominator       = sqrt(denominator2); 
+                float Gpol              = (apos.w * psA[tj].q) / (denominator * denominator2); 
+                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
+                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
+                dx                     *= dGpol_dr; 
+                dy                     *= dGpol_dr; 
+                dz                     *= dGpol_dr; 
+                af.x                   -= dx; 
+                af.y                   -= dy; 
+                af.z                   -= dz; 
+                psA[tj].fx             += dx; 
+                psA[tj].fy             += dy; 
+                psA[tj].fz             += dz; 
+                af.w                   += dGpol_dalpha2_ij * psA[tj].br; 
+                psA[tj].fb             += dGpol_dalpha2_ij * br;        
+                tj                      = sNext[tj]; 
+            }
+           
+            // Write results: register totals belong to the tile-x atom, shared-memory totals to the tile-y atom
+            int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = af;
+            cSim.pBornForce[offset]             = af.w;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            af.x                                = sA[threadIdx.x].fx;
+            af.y                                = sA[threadIdx.x].fy;
+            af.z                                = sA[threadIdx.x].fz;
+            af.w                                = sA[threadIdx.x].fb;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride; // NOTE(review): repeats the assignment four lines up with the same value; redundant
+            cSim.pForce4a[offset]               = af;
+            cSim.pBornForce[offset]             = af.w;
+        }
+        pos -= cSim.nonbond_workBlock;     // advance to this group's next work unit
+    }
+}
+
+// Host-side launcher that always runs kCalculateObcGbsaForces1_12_kernel,
+// using gpu->sim.nonbond_blocks blocks of gpu->sim.nonbond_threads_per_block
+// threads, and then checks the launch with LAUNCHERROR.
+void kCalculateObcGbsaForces1_12(gpuContext gpu)
+{
+    const dim3 gridShape(gpu->sim.nonbond_blocks);
+    const dim3 blockShape(gpu->sim.nonbond_threads_per_block);
+
+    kCalculateObcGbsaForces1_12_kernel<<<gridShape, blockShape>>>();
+    LAUNCHERROR("kCalculateObcGbsaForce1_12");
+}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudaKernels.h"
+
+// Per-thread shared-memory record used by kCalculateObcGbsaForces2_kernel.
+// Member order is x, y, z, r, sr, sr2, fx, fy, fz, fb, pos, wx, wy.
+// (The sum and oneOverR members remain disabled, like the Born-sum code in
+// the kernel.)
+struct Atom {
+    float x, y, z;      // coordinates, copied from cSim.pPosq[...].x/.y/.z
+    float r;            // cSim.pObcData[...].x
+    float sr;           // cSim.pObcData[...].y
+    float sr2;          // sr * sr, precomputed when the slot is filled
+    float fx, fy, fz;   // force accumulated on this slot's atom
+    float fb;           // cSim.pBornForce[...] value for this slot's atom
+    int pos;            // owning thread's current index into sWorkUnit
+    int wx, wy;         // x and y tile origins decoded from the current work unit
+};
+
+
+
+__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK]; // one Atom slot per thread; the kernel indexes it with threadIdx.x
+__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM]; // this block's slice of cSim.pWorkUnit, filled at kernel start
+__shared__ unsigned int sNext[GRID]; // successor table: sNext[i] = (i + 1) & (GRID - 1)
+
+static __constant__ cudaGmxSimulation cSim; // file-local constant-memory copy of gpu->sim, written by SetCalculateObcGbsaForces2Sim
+
+// Copies the host-side simulation descriptor gpu->sim into this file's
+// constant-memory symbol cSim. The cudaMemcpyToSymbol status is handed to
+// RTERROR together with the failure message.
+void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
+{
+    cudaError_t uploadStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(uploadStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copies this file's constant-memory symbol cSim back into the host-side
+// simulation descriptor gpu->sim. The cudaMemcpyFromSymbol status is handed
+// to RTERROR together with the failure message.
+//
+// The message now says "GetSim": it previously said "SetSim" (copied from the
+// Set function), which pointed at the wrong call when this copy failed.
+void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+__global__ void kCalculateObcGbsaForces2_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously    
+    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
+    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);    
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
+    }
+    __syncthreads();
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end = end - pos; 
+    sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
+       
+    while (sA[threadIdx.x].pos >= 0)
+    {  
+    
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x = (x >> 17) << GRIDBITS;
+        unsigned int tgx                = threadIdx.x & (GRID - 1);
+        unsigned int i                  = x + tgx;
+        float4 apos                     = cSim.pPosq[i];
+        float2 a                        = cSim.pObcData[i];
+        float fb                        = cSim.pBornForce[i];
+        unsigned int tbx                = threadIdx.x - tgx;
+        int tj                          = tgx; 
+        Atom* psA                       = &sA[tbx];
+        sA[threadIdx.x].wx              = x;
+        sA[threadIdx.x].wy              = y; 
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        { 
+            // Read fixed atom data into registers and GRF
+            float3 af;
+            sA[threadIdx.x].fx = af.x   = 0.0f;
+            sA[threadIdx.x].fy = af.y   = 0.0f;
+            sA[threadIdx.x].fz = af.z   = 0.0f;
+//            float sum                   = 0.0f;
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+//            float oneOverR              = 1.0f / a.x;
+            sA[threadIdx.x].r           = a.x;
+            sA[threadIdx.x].sr          = a.y;
+            sA[threadIdx.x].sr2         = a.y * a.y;
+            sA[threadIdx.x].fb          = fb;
+
+            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
+            {
+                float dx                = psA[j].x - apos.x; 
+                float dy                = psA[j].y - apos.y; 
+                float dz                = psA[j].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz;
+                float r                 = sqrt(r2);
+                
+                // Atom I Born forces and sum
+                float rScaledRadiusJ    = r + psA[j].sr; 
+                float l_ij              = 1.0f / max(a.x, fabs(r - psA[j].sr));
+                float u_ij              = 1.0f / rScaledRadiusJ;
+                float rInverse          = 1.0f / r;
+                float l_ij2             = l_ij * l_ij;
+                float u_ij2             = u_ij * u_ij; 
+                float r2Inverse         = rInverse * rInverse;                   
+                float t1                = log (u_ij / l_ij);
+                float t2                = (l_ij2 - u_ij2);
+                float t3                = t2 * rInverse;
+                t1                     *= rInverse;
+                
+                // Born Forces term
+                float term              =  0.125f * 
+                                          (1.000f + psA[j].sr2 * r2Inverse) * t3 + 
+                                           0.250f * t1 * r2Inverse;
+                float dE                = fb * term;
+                
+                // Born sum term
+//                term                    =   l_ij - u_ij  +
+//                                           -0.25f * r * t2 +
+//                                            0.50f * t1 +
+//                                           (0.25f * psA[j].sr2) * t3;
+//                if (a.x < (psA[j].sr - r))
+//                {
+//                    term               += 2.0f * (oneOverR - l_ij);
+//                }
+                if (a.x >= rScaledRadiusJ)
+                {
+                    dE                  = /*term =*/ 0.0f;
+                }
+                
+                float d                 = dx * dE;
+                af.x                   -= d;
+                psA[j].fx              += d;
+                d                       = dy * dE;  
+                af.y                   -= d;
+                psA[j].fy              += d;
+                d                       = dz * dE;
+                af.z                   -= d;
+                psA[j].fz              += d; 
+//                sum                    += term;
+            }
+            
+            // Write results
+            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            float4 of;
+            of.x                        = af.x + sA[threadIdx.x].fx;
+            of.y                        = af.y + sA[threadIdx.x].fy;
+            of.z                        = af.z + sA[threadIdx.x].fz;
+            of.w                        = 0.0f;
+            cSim.pForce4b[offset]       = of;
+//            cSim.pBornSum[offset]       = sum;
+        }         
+        else 
+        {        
+            // Read fixed atom data into registers and GRF
+            int j                       = y + tgx;
+            float4 temp                 = cSim.pPosq[j];
+            float2 temp1                = cSim.pObcData[j];
+            sA[threadIdx.x].fb          = cSim.pBornForce[j];
+            float3 af;
+            sA[threadIdx.x].fx = af.x   = 0.0f;
+            sA[threadIdx.x].fy = af.y   = 0.0f;
+            sA[threadIdx.x].fz = af.z   = 0.0f;
+//            sA[threadIdx.x].sum         = 0.0f;
+//            float sum                   = 0.0f;
+            float sr2                   = a.y * a.y;
+            sA[threadIdx.x].x           = temp.x;
+            sA[threadIdx.x].y           = temp.y;
+            sA[threadIdx.x].z           = temp.z;
+            sA[threadIdx.x].r           = temp1.x;
+            sA[threadIdx.x].sr          = temp1.y;
+            sA[threadIdx.x].sr2         = temp1.y * temp1.y;
+//            sA[threadIdx.x].oneOverR    = 1.0f / temp1.x;
+
+            for (j = 0; j < GRID; j++)
+            {
+                float dx                = psA[tj].x - apos.x; 
+                float dy                = psA[tj].y - apos.y; 
+                float dz                = psA[tj].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz; 
+                float r                 = sqrt(r2);
+                
+                // Atom I Born Forces and sum
+                float r2Inverse         = 1.0f / r2;
+                float rScaledRadiusJ    = r + psA[tj].sr;
+                float rInverse          = 1.0f / r;
+                
+                
+                float l_ij              = 1.0f / max(a.x, fabs(r - psA[tj].sr));
+                float u_ij              = 1.0f / rScaledRadiusJ;
+                float l_ij2             = l_ij * l_ij;
+                float u_ij2             = u_ij * u_ij;
+                float t1                = log (u_ij / l_ij);
+                float t2                = (l_ij2 - u_ij2);
+                float t3                = t2 * rInverse;
+                t1                     *= rInverse;
+                   
+                // Born Forces term
+                float term              =  0.125f * 
+                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3 + 
+                                           0.250f * t1 * r2Inverse;
+                float dE                = fb * term;
+                 // Born sum term
+//                term                    =   l_ij - u_ij  +
+//                                           -0.25f * r * t2 +
+//                                            0.50f * t1 +
+//                                           (0.25f * psA[tj].sr2) * t3;
+//                if (a.x < (psA[tj].sr - r))
+//                {
+//                    term               += 2.0f * ((1.0f / a.x) - l_ij);
+//                }
+                if (a.x >= rScaledRadiusJ) 
+                {
+                    dE                  = /*term =*/ 0.0f;
+                }
+                
+                
+                float d                 = dx * dE;
+                af.x                   -= d;
+                psA[tj].fx             += d;
+                d                       = dy * dE;  
+                af.y                   -= d;
+                psA[tj].fy             += d;
+                d                       = dz * dE;
+                af.z                   -= d;
+                psA[tj].fz             += d;    
+//                sum                    += term;
+                
+                // Atom J Born Forces and sum               
+                float rScaledRadiusI    = r + a.y;
+                l_ij                    = 1.0f / max(psA[tj].r, fabs(r - a.y));
+                u_ij                    = 1.0f / rScaledRadiusI;
+                l_ij2                   = l_ij * l_ij;
+                u_ij2                   = u_ij * u_ij;
+                t1                      = log (u_ij / l_ij);
+                t2                      = (l_ij2 - u_ij2);
+                t3                      = t2 * rInverse;
+                t1                     *= rInverse;
+                  
+                // Born Forces term
+                term                    =  0.125f * 
+                                          (1.000f + sr2 * r2Inverse) * t3 + 
+                                           0.250f * t1 * r2Inverse;
+                dE                      = psA[tj].fb * term;  
+                
+                // Born sum term
+//                term                    =   l_ij - u_ij  +
+//                                           -0.25f * r * t2 +
+//                                            0.50f * t1 +
+//                                           (0.25f * sr2) * t3;
+//
+//                if (psA[tj].r < (a.y - r))
+//                {
+//                    term               +=  2.0f * (psA[tj].oneOverR - l_ij);
+//                }
+                if (psA[tj].r >= rScaledRadiusI) 
+                {
+                    dE                  = /*term =*/ 0.0f;
+                }                                    
+                dx                     *= dE;
+                dy                     *= dE;
+                dz                     *= dE;
+                psA[tj].fx             += dx; 
+                psA[tj].fy             += dy;
+                psA[tj].fz             += dz; 
+                af.x                   -= dx;
+                af.y                   -= dy;
+                af.z                   -= dz;
+//                psA[tj].sum            +=  term;
+                                       
+                tj                      = sNext[tj]; 
+            }
+                
+            // Write results
+            int offset                  = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
+            float4 of;
+            of.x                        = af.x;
+            of.y                        = af.y;
+            of.z                        = af.z;
+            of.w                        = 0.0f;
+            cSim.pForce4b[offset]       = of;
+//            cSim.pBornSum[offset]       = sum;
+            offset                      = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
+            of.x                        = sA[threadIdx.x].fx;
+            of.y                        = sA[threadIdx.x].fy;
+            of.z                        = sA[threadIdx.x].fz;
+            cSim.pForce4b[offset]       = of;
+//            cSim.pBornSum[offset]       = sA[threadIdx.x].sum;
+        }
+        sA[threadIdx.x].pos            -= cSim.bornForce2_workBlock;     
+    }
+}
+
+// Kernel variant used on SM 1.2 and later devices. It is defined in another
+// source file; this declaration lets the launcher below dispatch to it.
+__global__ extern void kCalculateObcGbsaForces2_12_kernel();
+
+// Launch the second OBC GBSA force pass.
+//
+// Devices whose gpu->sm_version is below SM_12 run kCalculateObcGbsaForces2_kernel;
+// all others run kCalculateObcGbsaForces2_12_kernel. Both use the launch
+// configuration stored in gpu->sim (bornForce2_blocks blocks of
+// bornForce2_threads_per_block threads). LAUNCHERROR is invoked afterwards
+// with this function's name.
+//
+// The former "if( 0 )" debug branch could never execute and has been removed.
+void kCalculateObcGbsaForces2(gpuContext gpu)
+{
+    //printf("kCalculateObcGbsaForces2\n");
+    if (gpu->sm_version < SM_12)
+    {
+        kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
+    }
+    else
+    {
+        kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
+    }
+
+    LAUNCHERROR("kCalculateObcGbsaForces2");
+}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+struct Atom {      // per-thread record kept in the shared array sA by the kernel in this file
+    float x;       // position x, copied from cSim.pPosq
+    float y;       // position y, copied from cSim.pPosq
+    float z;       // position z, copied from cSim.pPosq
+    float r;       // first component of the atom's cSim.pObcData entry (presumably its radius -- confirm)
+    float sr;      // second component of the atom's cSim.pObcData entry (presumably its scaled radius -- confirm)
+    float sr2;     // sr * sr, precomputed when the record is filled
+    float fx;      // force accumulator x: zeroed per work unit, then updated inside the pair loop
+    float fy;      // force accumulator y
+    float fz;      // force accumulator z
+    float fb;      // the atom's value from cSim.pBornForce; scales the pair term into dE
+//    float sum;   // (disabled) Born-sum accumulator; every use of it in the kernel is commented out
+};
+
+
+__shared__ Atom sA[GT2XX_BORNFORCE2_THREADS_PER_BLOCK];              // one Atom slot per thread, indexed by threadIdx.x
+__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];   // this block's slice of cSim.pWorkUnit, loaded once at kernel start
+__shared__ unsigned int sNext[GRID];                                 // sNext[i] = (i + 1) & (GRID - 1): the index after i, wrapping around
+// File-local constant-memory copy of the simulation parameters; uploaded by SetCalculateObcGbsaForces2_12Sim.
+static __constant__ cudaGmxSimulation cSim;
+
+// Upload the host-side simulation parameters (gpu->sim) into this file's
+// constant-memory copy, cSim. The copy result is handed to RTERROR together
+// with a descriptive message.
+void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
+{
+    cudaError_t uploadResult = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(uploadResult, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Read this file's constant-memory copy of the simulation parameters (cSim)
+// back into gpu->sim. The copy result is handed to RTERROR.
+void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // The message names the Get path; it previously said "SetSim", which made
+    // a failure here look as if it came from the upload function.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+/*
+ * kCalculateObcGbsaForces2_12_kernel
+ *
+ * Each block copies its share of cSim.pWorkUnit into sWorkUnit. Threads are
+ * then handled in groups: threadIdx.x >> GRIDBITS selects the group and
+ * threadIdx.x & (GRID - 1) is the index (tgx) inside it. A group takes one
+ * work unit at a time, starting from the end of the block's list and stepping
+ * back by cSim.bornForce2_workBlock until the index drops below zero.
+ *
+ * A work unit is decoded into two atom offsets x and y. Thread tgx keeps atom
+ * x + tgx in registers, while the group's slice of sA (psA) holds one record
+ * per thread:
+ *
+ *   x == y : sA holds the same atoms. Each thread visits every other entry of
+ *            the slice and applies the term scaled by its own fb.
+ *   x != y : sA holds atoms y + tgx. Each thread visits all GRID entries,
+ *            starting at its own index and advancing through sNext, and
+ *            applies both the term scaled by its own fb and the term scaled
+ *            by the visited entry's fb.
+ *
+ * Forces are stored with plain assignments (not accumulated) into
+ * cSim.pForce4b at offsets built from x, y, tgx and cSim.stride. All Born-sum
+ * code is present only as commented-out lines.
+ *
+ * NOTE(review): entries of sA are read and updated by other threads of the
+ * group with no barrier inside the work loop. This presumably relies on a
+ * group being a single warp that runs in lockstep -- confirm before using
+ * the kernel on hardware that schedules threads of a warp independently.
+ */
+__global__ void kCalculateObcGbsaForces2_12_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously. [pos, end) is this block's range in cSim.pWorkUnit.
+    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
+    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);    
+    if (threadIdx.x < end - pos)    // NOTE(review): one entry per thread, so this assumes end - pos <= blockDim.x -- confirm
+    {
+        sWorkUnit[threadIdx.x]          = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x]              = (threadIdx.x + 1) & (GRID - 1);   // successor index, wrapping back to 0
+    }
+    __syncthreads();
+
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    end                                 = end - pos; 
+    pos                                 = end - (threadIdx.x >> GRIDBITS) - 1;
+       
+    while (pos >= 0)
+    {  
+    
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                  = sWorkUnit[pos];
+        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;  // bits 2..16 of the work unit, shifted left by GRIDBITS
+        x                               = (x >> 17) << GRIDBITS;            // bits 17 and up of the work unit, shifted left by GRIDBITS
+        unsigned int tgx                = threadIdx.x & (GRID - 1);
+        unsigned int i                  = x + tgx;
+        float4 apos                     = cSim.pPosq[i];
+        float2 a                        = cSim.pObcData[i];
+        float fb                        = cSim.pBornForce[i];
+        unsigned int tbx                = threadIdx.x - tgx;
+        int tj                          = tgx; 
+        Atom* psA                       = &sA[tbx];                         // first sA entry belonging to this thread's group
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        { 
+            // Read fixed atom data into registers and GRF
+            float3 af;
+            sA[threadIdx.x].fx = af.x   = 0.0f;
+            sA[threadIdx.x].fy = af.y   = 0.0f;
+            sA[threadIdx.x].fz = af.z   = 0.0f;
+//            float sum                   = 0.0f;
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+//            float oneOverR              = 1.0f / a.x;
+            sA[threadIdx.x].r           = a.x;
+            sA[threadIdx.x].sr          = a.y;
+            sA[threadIdx.x].sr2         = a.y * a.y;
+            sA[threadIdx.x].fb          = fb;
+            
+            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])   // every entry of the slice except this thread's own
+            {
+                float dx                = psA[j].x - apos.x; 
+                float dy                = psA[j].y - apos.y; 
+                float dz                = psA[j].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz;
+                float r                 = sqrt(r2);
+                
+                
+                // Atom I Born forces and sum
+                float rScaledRadiusJ    = r + psA[j].sr;
+                
+                float l_ij          = 1.0f / max(a.x, fabs(r - psA[j].sr));
+                float u_ij          = 1.0f / rScaledRadiusJ;
+                float rInverse      = 1.0f / r;
+                float l_ij2         = l_ij * l_ij;
+                float u_ij2         = u_ij * u_ij;
+                float r2Inverse     = rInverse * rInverse;
+                float t1            = log (u_ij / l_ij);
+                float t2            = (l_ij2 - u_ij2);
+                float t3            = t2 * rInverse;
+                t1                 *= rInverse;
+                    
+                // Born Forces term
+                float term          =  0.125f * 
+                                      (1.000f + psA[j].sr2 * r2Inverse) * t3 + 
+                                       0.250f * t1 * r2Inverse;
+                float dE            = fb * term;
+                    
+                // Born sum term
+//                term                =  l_ij - u_ij  +
+//                                      -0.25f * r * t2 +
+//                                       0.50f * t1 +
+//                                      (0.25f * psA[j].sr2) * t3;
+//                if (a.x < (psA[j].sr - r))
+//                {
+//                    term           += 2.0f * (oneOverR - l_ij);
+//                }
+                    
+                if (a.x >= rScaledRadiusJ)  // no contribution when a.x is at least r + sr of the other atom
+                {
+                    dE              = /*term =*/ 0.0f;
+                }
+                float d             = dx * dE;
+                af.x               -= d;
+                psA[j].fx          += d;
+                d                   = dy * dE;  
+                af.y               -= d;
+                psA[j].fy          += d;
+                d                   = dz * dE;
+                af.z               -= d;
+                psA[j].fz          += d;                                          
+//                sum                += term;
+            }
+            
+            // Write results: register total plus what other threads added to this thread's sA entry
+            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            float4 of;
+            of.x                        = af.x + sA[threadIdx.x].fx;
+            of.y                        = af.y + sA[threadIdx.x].fy;
+            of.z                        = af.z + sA[threadIdx.x].fz;
+            of.w                        = 0.0f;
+            cSim.pForce4b[offset]       = of;
+//            cSim.pBornSum[offset]       = sum;
+        }         
+        else 
+        {        
+            // Read fixed atom data into registers and GRF
+            int j                       = y + tgx;
+            float4 temp                 = cSim.pPosq[j];
+            float2 temp1                = cSim.pObcData[j];
+            sA[threadIdx.x].fb          = cSim.pBornForce[j];
+            float3 af;
+            sA[threadIdx.x].fx = af.x   = 0.0f;
+            sA[threadIdx.x].fy = af.y   = 0.0f;
+            sA[threadIdx.x].fz = af.z   = 0.0f;
+//            sA[threadIdx.x].sum         = 0.0f;
+//            float sum                   = 0.0f;
+            float sr2                   = a.y * a.y;
+//            float oneOverR              = 1.0f / a.x;
+            sA[threadIdx.x].x           = temp.x;
+            sA[threadIdx.x].y           = temp.y;
+            sA[threadIdx.x].z           = temp.z;
+            sA[threadIdx.x].r           = temp1.x;
+            sA[threadIdx.x].sr          = temp1.y;
+            sA[threadIdx.x].sr2         = temp1.y * temp1.y;
+            for (j = 0; j < GRID; j++)  // j only counts iterations; tj is the sA index actually visited
+            {
+                float dx                = psA[tj].x - apos.x; 
+                float dy                = psA[tj].y - apos.y; 
+                float dz                = psA[tj].z - apos.z; 
+                float r2                = dx * dx + dy * dy + dz * dz; 
+                float r                 = sqrt(r2);
+                
+                // Interleaved Atom I and J Born Forces and sum components
+                float r2Inverse         = 1.0f / r2;
+                float rScaledRadiusJ    = r + psA[tj].sr;
+                float rScaledRadiusI    = r + a.y;
+                float rInverse          = 1.0f / r;
+                float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
+                float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
+                float u_ijJ             = 1.0f / rScaledRadiusJ;
+                float u_ijI             = 1.0f / rScaledRadiusI;
+                float l_ij2J            = l_ijJ * l_ijJ;
+                float l_ij2I            = l_ijI * l_ijI;
+                float u_ij2J            = u_ijJ * u_ijJ;
+                float u_ij2I            = u_ijI * u_ijI;
+                float t1J               = log (u_ijJ / l_ijJ);
+                float t1I               = log (u_ijI / l_ijI);
+                float t2J               = (l_ij2J - u_ij2J);
+                float t2I               = (l_ij2I - u_ij2I);
+                float t3J               = t2J * rInverse;
+                float t3I               = t2I * rInverse;
+                t1J                    *= rInverse;
+                t1I                    *= rInverse;
+                   
+                // Born Forces term
+                float term              =  0.125f * 
+                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3J + 
+                                           0.250f * t1J * r2Inverse;
+                float dE                = fb * term;
+                    
+                // Atom I Born sum term
+//                term                    =   l_ijJ - u_ijJ +
+//                                           -0.25f * r * t2J +
+//                                            0.50f * t1J +
+//                                           (0.25f * psA[tj].sr2) * t3J;
+//                if (a.x < (psA[tj].sr - r))
+//                {
+//                    term               += 2.0f * (oneOverR - l_ijJ);
+//                }
+                
+                if (a.x >= rScaledRadiusJ) 
+                {
+                    dE                  = /*term =*/ 0.0f;
+                }
+                
+                float d                 = dx * dE;
+                af.x                   -= d;
+                psA[tj].fx             += d;
+                d                       = dy * dE;  
+                af.y                   -= d;
+                psA[tj].fy             += d;
+                d                       = dz * dE;
+                af.z                   -= d;
+                psA[tj].fz             += d;                                          
+//                sum                    += term;
+               
+                // Atom J Born sum term               
+                term                    =  0.125f * 
+                                          (1.000f + sr2 * r2Inverse) * t3I + 
+                                           0.250f * t1I * r2Inverse;
+                dE                      = psA[tj].fb * term;  
+                
+//                term                    =  l_ijI - u_ijI +
+//                                          -0.25f * r * t2I +
+//                                           0.50f * t1I +
+//                                          (0.25f * sr2) * t3I;
+//                if (psA[tj].r < (a.y - r))
+//                {
+//                    term               += 2.0f * ((1.0f / psA[tj].r) - l_ijI);
+//                }
+                
+                if (psA[tj].r >= rScaledRadiusI) 
+                {           
+                    dE                  = /*term =*/ 0.0f;
+                }                             
+                dx                     *= dE;
+                dy                     *= dE;
+                dz                     *= dE;
+                psA[tj].fx             += dx; 
+                psA[tj].fy             += dy;
+                psA[tj].fz             += dz; 
+                af.x                   -= dx;
+                af.y                   -= dy;
+                af.z                   -= dz;    
+//                psA[tj].sum            += term;
+                tj                      = sNext[tj]; 
+            }
+                
+            // Write results: first the register totals for atom x + tgx, then the sA totals for atom y + tgx
+            int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            float4 of;
+            of.x                        = af.x;
+            of.y                        = af.y;
+            of.z                        = af.z;
+            of.w                        = 0.0f;
+            cSim.pForce4b[offset]       = of;
+//            cSim.pBornSum[offset]       = sum;
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            of.x                        = sA[threadIdx.x].fx;
+            of.y                        = sA[threadIdx.x].fy;
+            of.z                        = sA[threadIdx.x].fz;
+            cSim.pForce4b[offset]       = of;
+//            cSim.pBornSum[offset]       = sA[threadIdx.x].sum;
+        }
+        pos                            -= cSim.bornForce2_workBlock;     
+    }
+}
+
+// Host wrapper: run kCalculateObcGbsaForces2_12_kernel with the launch
+// configuration stored in gpu->sim, then invoke the LAUNCHERROR check.
+void kCalculateObcGbsaForces2_12(gpuContext gpu)
+{
+    dim3 gridSize(gpu->sim.bornForce2_blocks);
+    dim3 blockSize(gpu->sim.bornForce2_threads_per_block);
+
+    kCalculateObcGbsaForces2_12_kernel<<<gridSize, blockSize>>>();
+    LAUNCHERROR("kCalculateObcGbsaForces2_12");
+}
--- a/platforms/cuda/src/kernels/kForces.cu
+++ b/platforms/cuda/src/kernels/kForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+#define FABS(a) ((a) > 0.0f ? (a) : -(a))   // float absolute value; the argument is evaluated twice, so avoid side effects
+// File-local constant-memory copy of the simulation parameters; SetForcesSim uploads it and GetForcesSim reads it back.
+static __constant__ cudaGmxSimulation cSim;
+
+// Copy gpu->sim from the host into cSim, the constant-memory parameter block
+// read by the kernels in this file. The outcome is passed to RTERROR.
+void SetForcesSim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copy cSim, this file's constant-memory parameter block, back into gpu->sim.
+// The outcome is passed to RTERROR.
+void GetForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // Report the failure under the Get path's own name; the message used to
+    // say "SetSim", pointing readers of the log at the wrong function.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+// Zero the force output buffers: the first cSim.stride4 * cSim.outputBuffers
+// floats of cSim.pForce4 are set to 0. Each thread starts at its global index
+// and advances by the total number of launched threads until it passes the end.
+__global__ void kClearForces_kernel()
+{
+    float* const forces         = (float*)cSim.pForce4;
+    const unsigned int step     = gridDim.x * blockDim.x;
+
+    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+         idx < cSim.stride4 * cSim.outputBuffers;
+         idx += step)
+    {
+        forces[idx] = 0.0f;
+    }
+}
+
+// Host wrapper: run kClearForces_kernel on gpu->sim.blocks blocks of 384
+// threads each, then invoke the LAUNCHERROR check.
+void kClearForces(gpuContext gpu)
+{
+    dim3 gridSize(gpu->sim.blocks);
+    dim3 blockSize(384);
+
+    kClearForces_kernel<<<gridSize, blockSize>>>();
+    LAUNCHERROR("kClearForces");
+}
+
+// Zero the Born force buffers: the first cSim.stride * cSim.nonbondOutputBuffers
+// floats of cSim.pBornForce are set to 0. Each thread starts at its global
+// index and advances by the total number of launched threads.
+__global__ void kClearBornForces_kernel()
+{
+    float* const bornForces     = (float*)cSim.pBornForce;
+    const unsigned int step     = gridDim.x * blockDim.x;
+
+    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+         idx < cSim.stride * cSim.nonbondOutputBuffers;
+         idx += step)
+    {
+        bornForces[idx] = 0.0f;
+    }
+}
+
+// Host wrapper: run kClearBornForces_kernel on gpu->sim.blocks blocks of 384
+// threads each, then invoke the LAUNCHERROR check.
+void kClearBornForces(gpuContext gpu)
+{
+    dim3 gridSize(gpu->sim.blocks);
+    dim3 blockSize(384);
+
+    kClearBornForces_kernel<<<gridSize, blockSize>>>();
+    LAUNCHERROR("kClearBornForces");
+}
+
+// Two-phase reduction over the per-buffer outputs.
+//
+// Phase 1: for each of the first cSim.stride4 floats of cSim.pForce4, add up
+// the cSim.outputBuffers copies that lie cSim.stride4 floats apart and store
+// the total back into the first copy.
+//
+// Phase 2: for each atom index below cSim.atoms, add up the
+// cSim.nonbondOutputBuffers copies of cSim.pBornSum that lie cSim.stride
+// floats apart, store the total back, and derive from it the values written
+// to cSim.pBornRadii and cSim.pObcChain.
+//
+// pos is deliberately not reset between the phases: when phase 1 finishes,
+// every thread holds a value >= cSim.stride4, and phase 2 uses
+// pos - cSim.stride4 as its atom index.
+__global__ void kReduceBornSumAndForces_kernel()
+{
+    const unsigned int totalThreads = gridDim.x * blockDim.x;
+    unsigned int pos                = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Phase 1: reduce forces
+    for (; pos < cSim.stride4; pos += totalThreads)
+    {
+        float* const pOut   = (float*)cSim.pForce4 + pos;
+        const float* pIn    = pOut;
+        float totalForce    = 0.0f;
+        int left            = cSim.outputBuffers;
+
+        // Consume four buffers per pass while at least four remain
+        for (; left >= 4; left -= 4)
+        {
+            const float f1  = pIn[0];
+            const float f2  = pIn[cSim.stride4];
+            const float f3  = pIn[2 * cSim.stride4];
+            const float f4  = pIn[3 * cSim.stride4];
+            pIn            += 4 * cSim.stride4;
+            totalForce     += f1 + f2 + f3 + f4;
+        }
+
+        // Then a pair, if two or three remain
+        if (left >= 2)
+        {
+            const float f1  = pIn[0];
+            const float f2  = pIn[cSim.stride4];
+            pIn            += 2 * cSim.stride4;
+            totalForce     += f1 + f2;
+            left           -= 2;
+        }
+
+        // Then the last odd buffer
+        if (left > 0)
+        {
+            totalForce     += *pIn;
+        }
+
+        *pOut = totalForce;
+    }
+
+    // Phase 2: reduce Born sums and compute the derived per-atom values
+    for (; pos - cSim.stride4 < cSim.atoms; pos += totalThreads)
+    {
+        const unsigned int atom = pos - cSim.stride4;
+        const float2 obc        = cSim.pObcData[atom];
+        const float* pIn        = cSim.pBornSum + atom;
+        float sum               = 0.0f;
+        int left                = cSim.nonbondOutputBuffers;
+
+        // Same 4 / 2 / 1 grouping as in phase 1
+        for (; left >= 4; left -= 4)
+        {
+            const float f1  = pIn[0];
+            const float f2  = pIn[cSim.stride];
+            const float f3  = pIn[2 * cSim.stride];
+            const float f4  = pIn[3 * cSim.stride];
+            pIn            += 4 * cSim.stride;
+            sum            += f1 + f2 + f3 + f4;
+        }
+        if (left >= 2)
+        {
+            const float f1  = pIn[0];
+            const float f2  = pIn[cSim.stride];
+            pIn            += 2 * cSim.stride;
+            sum            += f1 + f2;
+            left           -= 2;
+        }
+        if (left > 0)
+        {
+            sum            += *pIn;
+        }
+
+        // Store the reduced sum before it is rescaled below
+        cSim.pBornSum[atom] = sum;
+
+        // Now calculate Born radius and OBC term.
+        sum                        *= 0.5f * obc.x;
+        const float sum2            = sum * sum;
+        const float sum3            = sum * sum2;
+        const float tanhSum         = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
+        const float nonOffsetRadii  = obc.x + cSim.dielectricOffset;
+        const float bornRadius      = 1.0f / (1.0f / obc.x - tanhSum / nonOffsetRadii);
+        float obcChain              = obc.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
+        obcChain                    = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
+
+        cSim.pBornRadii[atom]       = bornRadius;
+        cSim.pObcChain[atom]        = obcChain;
+    }
+}
+
+// Host wrapper: run kReduceBornSumAndForces_kernel on gpu->sim.blocks blocks
+// of gpu->sim.bsf_reduce_threads_per_block threads, then invoke the
+// LAUNCHERROR check.
+//
+// The "#if 0" force-dump scaffolding that used to follow the launch was never
+// compiled and has been removed.
+void kReduceBornSumAndForces(gpuContext gpu)
+{
+    //printf("kReduceBornSumAndForces\n");
+    kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
+    LAUNCHERROR("kReduceBornSumAndForces");
+}
+
+// Reduce the force output buffers. For each of the first cSim.stride4 floats
+// of cSim.pForce4, add up the cSim.outputBuffers copies that lie cSim.stride4
+// floats apart and store the total back into the first copy. Each thread
+// starts at its global index and advances by the total number of launched
+// threads.
+__global__ void kReduceForces_kernel()
+{
+    const unsigned int totalThreads = gridDim.x * blockDim.x;
+
+    for (unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < cSim.stride4; pos += totalThreads)
+    {
+        float* const pOut   = (float*)cSim.pForce4 + pos;
+        const float* pIn    = pOut;
+        float totalForce    = 0.0f;
+        int left            = cSim.outputBuffers;
+
+        // Consume four buffers per pass while at least four remain
+        for (; left >= 4; left -= 4)
+        {
+            const float f1  = pIn[0];
+            const float f2  = pIn[cSim.stride4];
+            const float f3  = pIn[2 * cSim.stride4];
+            const float f4  = pIn[3 * cSim.stride4];
+            pIn            += 4 * cSim.stride4;
+            totalForce     += f1 + f2 + f3 + f4;
+        }
+
+        // Then a pair, if two or three remain
+        if (left >= 2)
+        {
+            const float f1  = pIn[0];
+            const float f2  = pIn[cSim.stride4];
+            pIn            += 2 * cSim.stride4;
+            totalForce     += f1 + f2;
+            left           -= 2;
+        }
+
+        // Then the last odd buffer
+        if (left > 0)
+        {
+            totalForce     += *pIn;
+        }
+
+        *pOut = totalForce;
+    }
+}
+
+// Host wrapper: run kReduceForces_kernel on gpu->sim.blocks blocks of
+// gpu->sim.bsf_reduce_threads_per_block threads, then invoke the LAUNCHERROR
+// check.
+void kReduceForces(gpuContext gpu)
+{
+    dim3 gridSize(gpu->sim.blocks);
+    dim3 blockSize(gpu->sim.bsf_reduce_threads_per_block);
+
+    kReduceForces_kernel<<<gridSize, blockSize>>>();
+    LAUNCHERROR("kReduceForces");
+}
+
--- a/platforms/cuda/src/kernels/kRandom.cu
+++ b/platforms/cuda/src/kernels/kRandom.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+static __constant__ cudaGmxSimulation cSim;
+
+// Uploads the host-side simulation parameter block (gpu->sim) into cSim, this
+// file's private __constant__ copy, so the kernels below can read it.
+void SetRandomSim(gpuContext gpu)
+{
+    // One transfer of the whole struct; the result code is handed to RTERROR
+    // together with the message that identifies the failing call.
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Reads this file's __constant__ parameter block (cSim) back into the
+// host-side copy gpu->sim. Counterpart of SetRandomSim.
+void GetRandomSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // The message used to say "SetSim", which pointed at the wrong operation
+    // when the read-back failed; it now names the Get path.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+extern __shared__ float3 sRand[];
+
+
+/* Fills cSim.pRandom4a / cSim.pRandom2a with scaled normal random numbers.
+ *
+ * Every thread owns one uint4 generator state in cSim.pRandomSeed, indexed by
+ * its global thread id. The seed array is read and written without a bounds
+ * check, so it must hold one entry per launched thread.
+ *
+ * Each pass of the outer loop produces six normal deviates per thread using
+ * the Box-Muller form sqrt(-2 ln u1) * cos(2 pi u2), stages them in two float3
+ * slots of dynamic shared memory (sRand[threadIdx.x] and
+ * sRand[threadIdx.x + blockDim.x]), scales them and writes one float4 plus one
+ * float2 at index pos. The launch must therefore provide
+ * 2 * blockDim.x * sizeof(float3) bytes of dynamic shared memory.
+ *
+ * A thread only ever touches its own two sRand slots, so no __syncthreads()
+ * is required.
+ *
+ * NOTE(review): the generator combines three components (state.x, state.y and
+ * the state.z/state.w pair) and looks like a KISS-style generator - confirm
+ * against the original reference before changing any constant.
+ */ __global__ void kGenerateRandoms_kernel()
+{
+    unsigned int pos            = blockIdx.x * blockDim.x + threadIdx.x; // global thread id, also the first output slot
+    unsigned int increment      = blockDim.x * gridDim.x; // number of threads in the grid = loop stride
+    
+    // Read generator state
+    uint4 state                 = cSim.pRandomSeed[pos];
+    unsigned int carry          = 0; // not stored with the state: restarts at 0 on every launch
+    
+    float4 random4;
+    float2 random2;
+    while (pos < cSim.totalRandomsTimesTwo) // grid-stride loop over all output slots
+    {
+        
+        // Generate 6 randoms in GRF
+        unsigned int pos1       = threadIdx.x; // first of this thread's two sRand slots
+        for (int i = 0; i < 2; i++) // i = 0 fills slot threadIdx.x, i = 1 fills slot threadIdx.x + blockDim.x
+        {
+            state.x             = state.x * 69069 + 1; // component 1: linear congruential step
+            state.y            ^= state.y << 13; // component 2: xorshift with shifts 13, 17, 5
+            state.y            ^= state.y >> 17;
+            state.y            ^= state.y << 5;
+            unsigned int k      = (state.z >> 2) + (state.w >> 3) + (carry >> 2); // component 3: recurrence on (z, w) with carry
+            unsigned int m      = state.w + state.w + state.z + carry;
+            state.z             = state.w;
+            state.w             = m;
+            carry               = k >> 30;
+            float x1            = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff; // uniform in (0, 1]; clamped to >= 1 so log() below never sees 0
+            state.x             = state.x * 69069 + 1; // the next generator step is interleaved with finishing x1
+            state.y            ^= state.y << 13;
+            state.y            ^= state.y >> 17;
+            state.y            ^= state.y << 5;
+            x1                  = sqrt(-2.0f * log(x1)); // Box-Muller radius
+            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+            m                   = state.w + state.w + state.z + carry;
+            state.z             = state.w;
+            state.w             = m;
+            carry               = k >> 30;
+            float x2            = (float)(state.x + state.y + state.w) / (float)0xffffffff; // uniform used only as an angle, so no clamp
+            
+            state.x             = state.x * 69069 + 1;
+            state.y            ^= state.y << 13;
+            state.y            ^= state.y >> 17;
+            state.y            ^= state.y << 5;
+            sRand[pos1].x       = x1 * cos(2.0f * 3.14159265f * x2); // first normal deviate of this slot
+            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+            m                   = state.w + state.w + state.z + carry;
+            state.z             = state.w;
+            state.w             = m;
+            carry               = k >> 30;
+            float x3            = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff; // same scheme as x1
+            state.x             = state.x * 69069 + 1;
+            state.y            ^= state.y << 13;
+            state.y            ^= state.y >> 17;
+            state.y            ^= state.y << 5;
+            x3                  = sqrt(-2.0f * log(x3));
+            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+            m                   = state.w + state.w + state.z + carry;
+            state.z             = state.w;
+            state.w             = m;
+            carry               = k >> 30;
+            float x4            = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+            
+            state.x             = state.x * 69069 + 1;
+            state.y            ^= state.y << 13;
+            state.y            ^= state.y >> 17;
+            state.y            ^= state.y << 5;
+            sRand[pos1].y       = x3 * cos(2.0f * 3.14159265f * x4); // second normal deviate of this slot
+            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+            m                   = state.w + state.w + state.z + carry;
+            state.z             = state.w;
+            state.w             = m;
+            carry               = k >> 30;
+            float x5            = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff; // same scheme as x1
+            state.x             = state.x * 69069 + 1;
+            state.y            ^= state.y << 13;
+            state.y            ^= state.y >> 17;
+            state.y            ^= state.y << 5;
+            x5                  = sqrt(-2.0f * log(x5));
+            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
+            m                   = state.w + state.w + state.z + carry;
+            state.z             = state.w;
+            state.w             = m;
+            carry               = k >> 30;
+            float x6            = (float)(state.x + state.y + state.w) / (float)0xffffffff;
+            sRand[pos1].z       = x5 * cos(2.0f * 3.14159265f * x6); // third normal deviate of this slot
+            pos1               += blockDim.x; // advance to this thread's second slot
+        }
+        
+        // Output final randoms
+        float c1, c2; // c1 scales the first three deviates, c2 the last three
+        if (pos < cSim.totalRandoms) // the first totalRandoms slots use (Yv, V), the remaining ones (Yx, X)
+        {
+            c1                  = cSim.Yv;
+            c2                  = cSim.V;
+        }
+        else
+        {
+            c1                  = cSim.Yx;
+            c2                  = cSim.X;
+        }
+        random4.x               = c1 * sRand[threadIdx.x].x;
+        random4.y               = c1 * sRand[threadIdx.x].y;
+        random4.z               = c1 * sRand[threadIdx.x].z;
+        random4.w               = c2 * sRand[threadIdx.x + blockDim.x].x;
+        cSim.pRandom4a[pos]     = random4; // NOTE(review): slots >= totalRandoms presumably back pRandom4b/pRandom2b - confirm
+        random2.x               = c2 * sRand[threadIdx.x + blockDim.x].y;
+        random2.y               = c2 * sRand[threadIdx.x + blockDim.x].z;
+        cSim.pRandom2a[pos]     = random2;
+        
+   
+        pos += increment;
+    }
+    
+    
+    // Write generator state
+    pos                     = blockIdx.x * blockDim.x + threadIdx.x; // recompute the thread's own seed slot (pos was advanced by the loop)
+    cSim.pRandomSeed[pos]   = state;
+}
+
+// Host launcher for kGenerateRandoms_kernel.
+//
+// Launches gpu->sim.blocks blocks of gpu->sim.random_threads_per_block threads
+// and reserves two float3 entries of dynamic shared memory per thread, which
+// is what the kernel indexes (sRand[threadIdx.x] and
+// sRand[threadIdx.x + blockDim.x]).
+void kGenerateRandoms(gpuContext gpu)
+{
+    kGenerateRandoms_kernel<<<gpu->sim.blocks, gpu->sim.random_threads_per_block, gpu->sim.random_threads_per_block * 2 * sizeof(float3)>>>();
+    // Check the launch the same way the other launchers in this code base do;
+    // without it an invalid launch configuration went unreported.
+    LAUNCHERROR("kGenerateRandoms");
+}
\ No newline at end of file
--- a/platforms/cuda/src/kernels/kUpdateShakeH.cu
+++ b/platforms/cuda/src/kernels/kUpdateShakeH.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+
+#define DeltaShake
+
+#include "gputypes.h"
+
+// Per-thread scratch record used by kApplyFirstShake_kernel (one instance per
+// thread in shared memory). Field order is unchanged from the original layout.
+struct Atom
+{
+    // Separation vectors between the central atom and each of its (up to
+    // three) constrained partners, taken from the pre-update positions.
+    float3 rij1, rij2, rij3;
+
+    // Constraint parameters loaded from cSim.pShakeParameter:
+    // M = params.y, d2 = params.z, InvMassI = params.x.
+    // NOTE(review): d2 is presumably the squared target distance and M a
+    // reduced-mass factor - confirm against the code that fills the table.
+    float  M, d2, InvMassI;
+
+    // Squared lengths of rij1, rij2 and rij3.
+    float  rij1sq, rij2sq, rij3sq;
+};
+
+
+static __constant__ cudaGmxSimulation cSim;
+
+// Uploads the host-side simulation parameter block (gpu->sim) into cSim, this
+// file's private __constant__ copy, so the update/SHAKE kernels can read it.
+void SetUpdateShakeHSim(gpuContext gpu)
+{
+    // One transfer of the whole struct; the result code is handed to RTERROR
+    // together with the message that identifies the failing call.
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Reads this file's __constant__ parameter block (cSim) back into the
+// host-side copy gpu->sim. Counterpart of SetUpdateShakeHSim.
+void GetUpdateShakeHSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // The message used to say "SetSim", which pointed at the wrong operation
+    // when the read-back failed; it now names the Get path.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+/* First half of the stochastic velocity/position update, without
+ * centre-of-mass correction.
+ *
+ * For every atom (grid-stride loop over cSim.atoms) the kernel:
+ *   - builds Vmh from the stored xVector and three random numbers,
+ *   - builds a new vVector from three further random numbers and stores it in
+ *     cSim.pvVector4,
+ *   - updates the velocity from the old velocity, the force, vVector and Vmh,
+ *   - saves the current position in cSim.pOldPosq,
+ *   - writes the trial move to cSim.pPosqP and the velocity to cSim.pVelm4.
+ *
+ * velocity.w is treated as the inverse mass (see sqrtInvMass and the
+ * velocity.w * force terms).
+ *
+ * DeltaShake is defined at the top of this file, so pPosqP receives the
+ * displacement velocity * fix1 rather than an absolute position; the .w
+ * component of the old position is carried over unchanged.
+ *
+ * NOTE(review): the constants EM, TauOneMinusEM and DOverTauC suggest a
+ * Langevin-type integrator - confirm against the host code that sets them.
+ */ __global__ void kUpdatePart1_kernel()
+{
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x; // global thread id = first atom handled by this thread
+    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x]; // this block's current offset into the random-number arrays
+    __syncthreads(); // NOTE(review): nothing visible in this kernel depends on this barrier (no shared memory, no write to pRandomPosition)
+    
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xVector          = cSim.pxVector4[pos];
+        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float2 random2a         = cSim.pRandom2a[rpos + pos];
+        float4 apos             = cSim.pPosq[pos];
+        float4 force            = cSim.pForce4[pos];
+        
+        float3 Vmh;
+        float sqrtInvMass       = sqrt(velocity.w);
+        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
+        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
+        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
+        float4 vVector; // uses the remaining three randoms: random4a.w, random2a.x, random2a.y
+        vVector.x               = sqrtInvMass * random4a.w;
+        vVector.y               = sqrtInvMass * random2a.x;
+        vVector.z               = sqrtInvMass * random2a.y;
+        vVector.w               = 0.0f;
+        cSim.pvVector4[pos]     = vVector; // read back by kUpdatePart2_kernel
+        velocity.x              = velocity.x * cSim.EM + 
+                                  velocity.w * force.x * cSim.TauOneMinusEM +
+                                  vVector.x -
+                                  cSim.EM * Vmh.x;
+        velocity.y              = velocity.y * cSim.EM + 
+                                  velocity.w * force.y * cSim.TauOneMinusEM +
+                                  vVector.y -
+                                  cSim.EM * Vmh.y;
+        velocity.z              = velocity.z * cSim.EM + 
+                                  velocity.w * force.z * cSim.TauOneMinusEM +
+                                  vVector.z -
+                                  cSim.EM * Vmh.z;
+        cSim.pOldPosq[pos]      = apos; // keep the pre-update position for the constraint kernel
+#ifndef DeltaShake
+        apos.x                 += velocity.x * cSim.fix1;
+        apos.y                 += velocity.y * cSim.fix1;
+        apos.z                 += velocity.z * cSim.fix1;
+#else
+        apos.x                  = velocity.x * cSim.fix1; // active branch: store the displacement only
+        apos.y                  = velocity.y * cSim.fix1;
+        apos.z                  = velocity.z * cSim.fix1;
+#endif
+        cSim.pPosqP[pos]        = apos;
+        cSim.pVelm4[pos]        = velocity;        
+        pos                    += blockDim.x * gridDim.x;
+    }
+}
+
+/* Same update as kUpdatePart1_kernel, but additionally subtracts a
+ * centre-of-mass velocity from every atom's new velocity.
+ *
+ * Phase 1: every block independently sums the per-block entries of
+ *          cSim.pLinearMomentum (indices 0 .. gridDim.x - 1) and reduces the
+ *          per-thread partial sums in shared memory; the total ends up in
+ *          sCM[0].
+ * Phase 2: the per-atom update, with sCM[0] subtracted from the velocity.
+ *
+ * Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
+ * The "#if 0" variant of phase 1 is compiled out.
+ */ __global__ void kUpdatePart1CM_kernel()
+{
+    extern __shared__ float3 sCM[];
+    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x; // global thread id = first atom handled by this thread
+    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x]; // this block's current offset into the random-number arrays
+    float3 CM           = { 0.0f, 0.0f, 0.0f}; // this thread's partial sum
+    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
+    
+    // Read CM outputs from previous step
+    unsigned int cpos = threadIdx.x; // each thread starts at its own index and strides by blockDim.x
+#if 0
+    float4 CM2          = { 0.0f, 0.0f, 0.0f, 0.0f };
+    float4 CM3          = { 0.0f, 0.0f, 0.0f, 0.0f };
+    float4 CM4          = { 0.0f, 0.0f, 0.0f, 0.0f };
+    if (cpos < gridDim.x)
+        CM1             = cSim.pLinearMomentum[cpos];
+    cpos               += gridDim.x;
+    if (cpos < gridDim.x)
+        CM2             = cSim.pLinearMomentum[cpos];
+    cpos               += gridDim.x;
+    if (cpos < gridDim.x)
+        CM3             = cSim.pLinearMomentum[cpos];
+    cpos               += gridDim.x;
+    if (cpos < gridDim.x)
+        CM4             = cSim.pLinearMomentum[cpos];
+    sCM[threadIdx.x].x  = CM1.x + CM2.x + CM3.x + CM4.x;
+    sCM[threadIdx.x].y  = CM1.y + CM2.y + CM3.y + CM4.y;
+    sCM[threadIdx.x].z  = CM1.z + CM2.z + CM3.z + CM4.z;
+#else
+    while (cpos < gridDim.x)
+    {
+        CM1             = cSim.pLinearMomentum[cpos];
+        CM.x           += CM1.x;
+        CM.y           += CM1.y;
+        CM.z           += CM1.z;
+        cpos           += blockDim.x;
+    }
+    sCM[threadIdx.x].x  = CM.x;
+    sCM[threadIdx.x].y  = CM.y;
+    sCM[threadIdx.x].z  = CM.z;
+#endif
+    __syncthreads(); // all partial sums must be in shared memory before the reduction reads them
+    
+    // Reduce CM
+    unsigned int offset = 1; // distance to the partner element; doubles every round
+    unsigned int mask   = 1; // 1, 3, 7, ...: selects threads whose low bits are all zero
+    while (offset < blockDim.x) // uniform condition, so every thread reaches the barrier below
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x)) // second test keeps the partner in range for any block size
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads(); // finish this round before the next one reads the updated sums
+    }       
+    
+    while (pos < cSim.atoms) // from here on sCM[0] holds the block-wide total
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 xVector          = cSim.pxVector4[pos];
+        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        float2 random2a         = cSim.pRandom2a[rpos + pos];
+        float4 apos             = cSim.pPosq[pos];
+        float4 force            = cSim.pForce4[pos];
+        
+        float3 Vmh;
+        float sqrtInvMass       = sqrt(velocity.w); // velocity.w is used as the inverse mass
+        Vmh.x                   = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
+        Vmh.y                   = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
+        Vmh.z                   = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
+        float4 vVector;
+        vVector.x               = sqrtInvMass * random4a.w;
+        vVector.y               = sqrtInvMass * random2a.x;
+        vVector.z               = sqrtInvMass * random2a.y;
+        vVector.w               = 0.0f;
+        cSim.pvVector4[pos]     = vVector;
+        velocity.x              = velocity.x * cSim.EM + 
+                                  velocity.w * force.x * cSim.TauOneMinusEM +
+                                  vVector.x -
+                                  cSim.EM * Vmh.x -
+                                  sCM[0].x;
+        velocity.y              = velocity.y * cSim.EM + 
+                                  velocity.w * force.y * cSim.TauOneMinusEM +
+                                  vVector.y -
+                                  cSim.EM * Vmh.y -
+                                  sCM[0].y;
+        velocity.z              = velocity.z * cSim.EM + 
+                                  velocity.w * force.z * cSim.TauOneMinusEM +
+                                  vVector.z -
+                                  cSim.EM * Vmh.z -
+                                  sCM[0].z;
+        cSim.pOldPosq[pos]      = apos; // keep the pre-update position for the constraint kernel
+#ifndef DeltaShake
+        apos.x                 += velocity.x * cSim.fix1;
+        apos.y                 += velocity.y * cSim.fix1;
+        apos.z                 += velocity.z * cSim.fix1;
+#else
+        apos.x                  = velocity.x * cSim.fix1; // active branch (DeltaShake is defined): store the displacement only
+        apos.y                  = velocity.y * cSim.fix1;
+        apos.z                  = velocity.z * cSim.fix1;
+#endif
+        cSim.pPosqP[pos]        = apos;
+        cSim.pVelm4[pos]        = velocity;        
+        pos                    += blockDim.x * gridDim.x;
+    }
+}
+
+
+
+// Host launcher for the first half of the update step.
+//
+// Runs the plain kernel unless gpu->bRemoveCM is set. When it is set, the
+// centre-of-mass variant is launched instead (with one float3 of dynamic
+// shared memory per thread) and the flag is cleared afterwards.
+// The "#if 0" sections are disabled debugging aids and are never compiled.
+void kUpdatePart1(gpuContext gpu)
+{
+//    printf("kUpdatePart1\n");
+#if 0
+    static int iteration = 0;
+    if (iteration == 0)
+    {
+        gpu->psPosq4->Download();
+        gpu->psVelm4->Download();
+        printf("# %d atoms\n", gpu->natoms);
+        for (int i = 0; i < gpu->natoms; i++)
+        {
+            printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
+                gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y,
+                gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w,
+                gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y,
+                gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w
+            );       
+        }
+    }
+    iteration++;
+#endif
+#if 0
+    static const float KILO 		        =    1e3;              		// Thousand
+    static const float BOLTZMANN	        =    1.380658e-23f;            // (J/K)	
+    static const float AVOGADRO	            =    6.0221367e23f;		    // ()		
+    static const float RGAS                 =    BOLTZMANN * AVOGADRO;     // (J/(mol K))
+    static const float BOLTZ                =    (RGAS / KILO);            // (kJ/(mol K)) 
+    static int iteration = 0;
+
+    // Check T
+    if (iteration % 1000 == 0)
+    {
+        gpu->psVelm4->Download();
+        float ke = 0.0f;
+        for (int i = 0; i < gpu->natoms; i++)
+        {
+            float vx = gpu->psVelm4->_pSysStream[0][i].x;
+            float vy = gpu->psVelm4->_pSysStream[0][i].y;
+            float vz = gpu->psVelm4->_pSysStream[0][i].z;
+            float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
+            ke += m * (vx * vx + vy * vy + vz * vz);
+        }
+        float T = ke / (BOLTZ  * gpu->sim.degreesOfFreedom);
+        printf("Iteration %d, Temperature is %f\n", iteration, T);
+    }
+    iteration++;
+#endif    
+    // Common case first: no centre-of-mass removal requested.
+    if (!gpu->bRemoveCM)
+    {
+        kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kUpdatePart1");
+        return;
+    }
+
+    // Centre-of-mass variant; the request flag is consumed by this launch.
+    kUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+    LAUNCHERROR("kUpdatePart1CM");
+    gpu->bRemoveCM = false;
+
+#if 0
+    gpu->psLinearMomentum->Download();
+    gpu->psVelm4->Download();
+    float3 mv = {0.0f, 0.0f, 0.0f};
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
+        mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
+        mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
+        mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
+    }
+    mv.x *= gpu->sim.inverseTotalMass;
+    mv.y *= gpu->sim.inverseTotalMass;
+    mv.z *= gpu->sim.inverseTotalMass;
+
+    float3 mv1 = {0.0f, 0.0f, 0.0f};
+    for (int i = 0; i < gpu->sim.blocks; i++)
+    {
+        mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
+        mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
+        mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
+    }
+    printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
+#endif
+}
+
+/* First SHAKE pass: iteratively corrects the trial values in cSim.pPosqP so
+ * that each constrained pair reaches its target squared distance d2 within
+ * cSim.shakeTolerance.
+ *
+ * One thread handles one entry of cSim.pShakeID (grid-stride loop over
+ * cSim.ShakeConstraints): a central atom atomID.x constrained to atomID.y and
+ * optionally to atomID.z and atomID.w, where -1 means "no such partner".
+ * cSim.pShakeParameter supplies (InvMassI, M, d2, invMassJ) for the entry; all
+ * partners share the single invMassJ value.
+ *
+ * Reference separations rij are taken from cSim.pOldPosq. DeltaShake is
+ * defined at the top of this file, so pPosqP holds displacements and the
+ * "#ifndef DeltaShake" conversions are compiled out.
+ *
+ * For each constraint the residual is
+ *     d2 - |rij + rpij|^2 = ld - 2 * (rij . rpij) - |rpij|^2
+ * and a correction along rij is applied while |residual| / (d2 * tolerance)
+ * is >= 1, for at most 15 sweeps over the entry's constraints.
+ *
+ * Uses a statically sized shared array, so blockDim.x must not exceed
+ * G8X_THREADS_PER_BLOCK. Each thread uses only its own element, so no
+ * barrier is needed.
+ */ __global__ void kApplyFirstShake_kernel()
+{
+    __shared__ Atom sA[G8X_THREADS_PER_BLOCK]; // one scratch record per thread
+    Atom* psA = &sA[threadIdx.x];
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; // global thread id = first constraint entry for this thread
+    while (pos < cSim.ShakeConstraints)
+    {
+        int4 atomID         = cSim.pShakeID[pos];
+        float4 params       = cSim.pShakeParameter[pos];
+        float4 apos         = cSim.pOldPosq[atomID.x]; // apos* = pre-update positions, xp* = trial values
+        float4 xpi          = cSim.pPosqP[atomID.x];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
+        float4 xpj1         = cSim.pPosqP[atomID.y];
+        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
+        psA->InvMassI       = params.x;
+        psA->M              = params.y;
+        psA->d2             = params.z;
+        float invMassJ      = params.w;
+        if (atomID.z != -1) // optional second partner
+        {
+            apos2           = cSim.pOldPosq[atomID.z]; 
+            xpj2            = cSim.pPosqP[atomID.z];
+        }   
+        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
+        if (atomID.w != -1) // optional third partner
+        {
+            apos3           = cSim.pOldPosq[atomID.w]; 
+            xpj3            = cSim.pPosqP[atomID.w];
+        }    
+       
+        float3 xi, xj1, xj2, xj3;
+        xi.x                = apos.x;
+        xi.y                = apos.y;
+        xi.z                = apos.z;
+        xj1.x               = apos1.x;
+        xj1.y               = apos1.y;
+        xj1.z               = apos1.z;
+        xj2.x               = apos2.x;
+        xj2.y               = apos2.y;
+        xj2.z               = apos2.z;
+        xj3.x               = apos3.x;
+        xj3.y               = apos3.y;
+        xj3.z               = apos3.z;
+#ifndef DeltaShake
+        xpi.x              -= xi.x;
+        xpi.y              -= xi.y;
+        xpi.z              -= xi.z;
+        xpj1.x             -= xj1.x;
+        xpj1.y             -= xj1.y;
+        xpj1.z             -= xj1.z;
+        xpj2.x             -= xj2.x;
+        xpj2.y             -= xj2.y;
+        xpj2.z             -= xj2.z;
+        xpj3.x             -= xj3.x;
+        xpj3.y             -= xj3.y;
+        xpj3.z             -= xj3.z;
+#endif
+        psA->rij1.x         = xi.x - xj1.x; // reference separations from the pre-update positions
+        psA->rij1.y         = xi.y - xj1.y;
+        psA->rij1.z         = xi.z - xj1.z;
+        psA->rij2.x         = xi.x - xj2.x; // rij2/rij3 are computed even when the partner is absent, but then never used
+        psA->rij2.y         = xi.y - xj2.y;
+        psA->rij2.z         = xi.z - xj2.z;
+        psA->rij3.x         = xi.x - xj3.x;
+        psA->rij3.y         = xi.y - xj3.y;
+        psA->rij3.z         = xi.z - xj3.z;
+        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
+        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
+        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
+        float ld1           = psA->d2 - psA->rij1sq; // constant part of each residual
+        float ld2           = psA->d2 - psA->rij2sq;
+        float ld3           = psA->d2 - psA->rij3sq;
+        
+        
+        bool converged = false;
+        int iteration = 0;
+        while (iteration < 15 && !converged) // at most 15 sweeps; stops early once no constraint needed a correction
+        {
+            converged = true;
+            float3 rpij;
+            rpij.x          = xpi.x - xpj1.x;
+            rpij.y          = xpi.y - xpj1.y;
+            rpij.z          = xpi.z - xpj1.z;
+		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; 
+		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance); // residual relative to the allowed tolerance
+            if (diff >= 1.0f)
+            {
+                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
+                float3 dr;
+                dr.x    = psA->rij1.x * acor; // correction is applied along the reference separation
+                dr.y    = psA->rij1.y * acor;
+                dr.z    = psA->rij1.z * acor;
+		        xpi.x  += dr.x * psA->InvMassI; // the two atoms move in opposite directions, weighted by inverse mass
+		        xpi.y  += dr.y * psA->InvMassI;
+		        xpi.z  += dr.z * psA->InvMassI;
+		        xpj1.x -= dr.x * invMassJ;
+		        xpj1.y -= dr.y * invMassJ;
+		        xpj1.z -= dr.z * invMassJ;
+                converged = false;
+            }
+            
+            if (atomID.z != -1) // same procedure for the second partner
+            {
+                rpij.x          = xpi.x - xpj2.x;
+                rpij.y          = xpi.y - xpj2.y;
+                rpij.z          = xpi.z - xpj2.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; 
+		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
+                    float3 dr;
+                    dr.x    = psA->rij2.x * acor;
+                    dr.y    = psA->rij2.y * acor;
+                    dr.z    = psA->rij2.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj2.x -= dr.x * invMassJ;
+		            xpj2.y -= dr.y * invMassJ;
+		            xpj2.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+            
+            if (atomID.w != -1) // same procedure for the third partner
+            {
+                rpij.x          = xpi.x - xpj3.x;
+                rpij.y          = xpi.y - xpj3.y;
+                rpij.z          = xpi.z - xpj3.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; 
+		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
+                    float3 dr;
+                    dr.x    = psA->rij3.x * acor;
+                    dr.y    = psA->rij3.y * acor;
+                    dr.z    = psA->rij3.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj3.x -= dr.x * invMassJ;
+		            xpj3.y -= dr.y * invMassJ;
+		            xpj3.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+            iteration++;
+        }
+        
+#ifndef DeltaShake
+        xpi.x  += xi.x;
+        xpi.y  += xi.y;
+        xpi.z  += xi.z;
+
+        xpj1.x += xj1.x;
+        xpj1.y += xj1.y;
+        xpj1.z += xj1.z;
+        
+        xpj2.x += xj2.x;
+        xpj2.y += xj2.y;
+        xpj2.z += xj2.z;
+        
+        xpj3.x += xj3.x;
+        xpj3.y += xj3.y;
+        xpj3.z += xj3.z;
+#endif
+        cSim.pPosqP[atomID.x] = xpi; // write the corrected values back; absent partners are skipped
+        cSim.pPosqP[atomID.y] = xpj1;
+        if (atomID.z != -1)
+            cSim.pPosqP[atomID.z] = xpj2;
+        if (atomID.w != -1)
+            cSim.pPosqP[atomID.w] = xpj3;     
+   
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+// Host launcher for kApplyFirstShake_kernel. Does nothing when the system
+// has no SHAKE constraints.
+void kApplyFirstShake(gpuContext gpu)
+{
+//    printf("kApplyFirstShake\n");       
+    // Nothing to constrain: skip the launch entirely.
+    if (!(gpu->sim.ShakeConstraints > 0))
+        return;
+
+    kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
+    LAUNCHERROR("kApplyFirstShake");
+}
+
+
+// Second half of the stochastic update, without centre-of-mass accumulation.
+//
+// For every atom (grid-stride loop over cSim.atoms) the kernel:
+//   - recomputes the velocity from the (constrained) trial value in
+//     cSim.pPosqP; DeltaShake is defined at the top of this file, so pPosqP
+//     holds a displacement and velocity = displacement * oneOverFix1,
+//   - builds Xmh from the stored vVector and three random numbers,
+//   - builds a new xVector from three further random numbers,
+//   - adds xVector - Xmh to the trial value and stores it in cSim.pPosq,
+//   - stores the velocity in cSim.pVelm4 and xVector in cSim.pxVector4.
+//
+// Finally thread 0 of each block advances the block's entry of
+// cSim.pRandomPosition by cSim.paddedNumberOfAtoms, wrapping at cSim.randoms.
+//
+// velocity.w is treated as the inverse mass (see sqrtInvMass).
+//
+// NOTE(review): with DeltaShake the value written to pPosq is still a
+// displacement; a later step presumably adds the old position back - confirm.
+__global__ void kUpdatePart2_kernel()
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+    // Every thread must have loaded rpos before thread 0 overwrites
+    // pRandomPosition[blockIdx.x] at the end of the kernel.
+    __syncthreads();
+    
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+#ifndef DeltaShake
+        float4 apos             = cSim.pPosq[pos];
+#endif
+        float4 xPrime           = cSim.pPosqP[pos];
+        float4 vVector          = cSim.pvVector4[pos];
+        float4 xVector;
+        float4 random4b         = cSim.pRandom4b[rpos + pos];
+        float2 random2b         = cSim.pRandom2b[rpos + pos];
+        float3 Xmh;
+        
+        float sqrtInvMass       = sqrt(velocity.w);
+#ifdef DeltaShake
+        velocity.x              = xPrime.x * cSim.oneOverFix1;
+        velocity.y              = xPrime.y * cSim.oneOverFix1;
+        velocity.z              = xPrime.z * cSim.oneOverFix1;
+#else
+        velocity.x              = (xPrime.x - apos.x) * cSim.oneOverFix1;
+        velocity.y              = (xPrime.y - apos.y) * cSim.oneOverFix1;
+        velocity.z              = (xPrime.z - apos.z) * cSim.oneOverFix1;
+#endif
+        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.x;
+        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.y;
+        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.z;
+        xVector.x               = sqrtInvMass * random4b.w;
+        xVector.y               = sqrtInvMass * random2b.x;
+        xVector.z               = sqrtInvMass * random2b.y;                    
+        // The whole float4 is stored below, so give the unused component a
+        // defined value (it was left uninitialised before). This mirrors
+        // vVector.w = 0.0f in the part-1 kernels.
+        xVector.w               = 0.0f;
+        xPrime.x               += xVector.x - Xmh.x;
+        xPrime.y               += xVector.y - Xmh.y;
+        xPrime.z               += xVector.z - Xmh.z;
+        
+    
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+        cSim.pxVector4[pos]     = xVector;
+         
+        pos                    += blockDim.x * gridDim.x;    
+    }
+
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        // NOTE(review): the wrap test is '>' so rpos == cSim.randoms is kept
+        // as is - confirm that this is intended.
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+}
+
+// Second part of the stochastic update, fused with a per-block accumulation
+// of the mass-weighted velocity.
+//
+// Launch requirements (see kUpdatePart2): 1D grid and 1D block, with
+// blockDim.x * sizeof(float3) bytes of dynamic shared memory backing sCM.
+//
+// For every atom handled by a thread the kernel
+//   - recomputes the velocity from the trial position stored in pPosqP,
+//   - adds mass * velocity to the thread's running total,
+//   - applies the random displacement built from pRandom4b / pRandom2b,
+//   - stores the new position, velocity and xVector.
+// The per-thread totals are scaled by inverseTotalMass, summed across the
+// block in shared memory, and the block's sum is written to
+// pLinearMomentum[blockIdx.x] (w component set to 0).
+__global__ void kUpdatePart2CM_kernel()
+{
+    extern __shared__ float3 sCM[];
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+    float3 CM                   = {0.0f, 0.0f, 0.0f};
+
+    // Thread 0 rewrites pRandomPosition[blockIdx.x] further down; this barrier
+    // guarantees every thread of the block has loaded the old value first.
+    __syncthreads();
+
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+#ifndef DeltaShake
+        float4 apos             = cSim.pPosq[pos];
+#endif
+        float4 xPrime           = cSim.pPosqP[pos];
+        float4 vVector          = cSim.pvVector4[pos];
+        float4 xVector;
+        float4 random4b         = cSim.pRandom4b[rpos + pos];
+        float2 random2b         = cSim.pRandom2b[rpos + pos];
+        float3 Xmh;
+        // velocity.w holds the inverse mass (mass is recovered as 1 / w).
+        float mass              = 1.0f / velocity.w;
+        float sqrtInvMass       = sqrtf(velocity.w);
+#ifdef DeltaShake
+        // With DeltaShake the trial value is used directly, without
+        // subtracting the current position.
+        velocity.x              = xPrime.x * cSim.oneOverFix1;
+        velocity.y              = xPrime.y * cSim.oneOverFix1;
+        velocity.z              = xPrime.z * cSim.oneOverFix1;
+#else
+        velocity.x              = (xPrime.x - apos.x) * cSim.oneOverFix1;
+        velocity.y              = (xPrime.y - apos.y) * cSim.oneOverFix1;
+        velocity.z              = (xPrime.z - apos.z) * cSim.oneOverFix1;
+#endif
+        CM.x                   += mass * velocity.x;
+        CM.y                   += mass * velocity.y;
+        CM.z                   += mass * velocity.z;
+
+        Xmh.x                   = vVector.x * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.x;
+        Xmh.y                   = vVector.y * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.y;
+        Xmh.z                   = vVector.z * cSim.TauDOverEMMinusOne +
+                                  sqrtInvMass * random4b.z;
+        xVector.x               = sqrtInvMass * random4b.w;
+        xVector.y               = sqrtInvMass * random2b.x;
+        xVector.z               = sqrtInvMass * random2b.y;
+        // Only x/y/z are computed here, but the whole float4 is stored below;
+        // give w a defined value so no uninitialized data reaches global memory.
+        xVector.w               = 0.0f;
+        xPrime.x               += xVector.x - Xmh.x;
+        xPrime.y               += xVector.y - Xmh.y;
+        xPrime.z               += xVector.z - Xmh.z;
+
+        cSim.pPosq[pos]         = xPrime;
+        cSim.pVelm4[pos]        = velocity;
+        cSim.pxVector4[pos]     = xVector;
+
+        pos                    += blockDim.x * gridDim.x;
+    }
+
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+
+    // Scale CM
+    CM.x *= cSim.inverseTotalMass;
+    CM.y *= cSim.inverseTotalMass;
+    CM.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = CM;
+    __syncthreads();
+
+    // Reduce CM for CTA: in each round the threads whose low bits (mask) are
+    // clear add in the partial sum located `offset` slots above them.
+    unsigned int offset = 1;
+    unsigned int mask   = 1;
+    while (offset < blockDim.x)
+    {
+        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
+            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
+            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
+        }
+        mask = 2 * mask + 1;
+        offset *= 2;
+        __syncthreads();
+    }
+
+    // Thread 0 publishes the block's total.
+    if (threadIdx.x == 0)
+    {
+        float4 blockCM;
+        blockCM.x                           = sCM[0].x;
+        blockCM.y                           = sCM[0].y;
+        blockCM.z                           = sCM[0].z;
+        blockCM.w                           = 0.0f;
+        cSim.pLinearMomentum[blockIdx.x]    = blockCM;
+    }
+}
+
+extern void kGenerateRandoms(gpuContext gpu);
+
+// Host-side driver for the second part of the stochastic update.
+//
+// Launches either the plain kernel or, when gpu->bCalculateCM is set, the
+// variant that also accumulates per-block mass-weighted velocities. In the
+// latter case bCalculateCM is cleared and bRemoveCM is raised.
+//
+// After the launch a function-local static counter is advanced; once it
+// reaches gpu->sim.randomIterations the random pool is regenerated through
+// kGenerateRandoms() and the counter restarts. Being static, the counter is
+// shared by every caller of this function.
+void kUpdatePart2(gpuContext gpu)
+{
+    if (!gpu->bCalculateCM)
+    {
+        kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kUpdatePart2");
+    }
+    else
+    {
+        // One float3 of dynamic shared memory per thread for the in-block reduction.
+        kUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kUpdatePart2CM");
+        gpu->bRemoveCM      = true;
+        gpu->bCalculateCM   = false;
+
+        // Disabled host-side cross-check of the device reduction.
+#if 0
+        gpu->psLinearMomentum->Download();
+        gpu->psVelm4->Download();
+        float3 mv = {0.0f, 0.0f, 0.0f};
+        for (int i = 0; i < gpu->natoms; i++)
+        {
+            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
+            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
+            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
+            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
+        }
+        mv.x *= gpu->sim.inverseTotalMass;
+        mv.y *= gpu->sim.inverseTotalMass;
+        mv.z *= gpu->sim.inverseTotalMass;
+
+        float3 mv1 = {0.0f, 0.0f, 0.0f};
+        for (int i = 0; i < gpu->sim.blocks; i++)
+        {
+            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
+            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
+            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
+        }
+        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
+#endif
+    }
+
+    // Refresh the random pool every randomIterations calls.
+    static int callsSinceRefresh = 0;
+    if (++callsSinceRefresh == gpu->sim.randomIterations)
+    {
+        kGenerateRandoms(gpu);
+        callsSinceRefresh = 0;
+    }
+}
+
+
+// Iterative distance-constraint solver for small atom clusters.
+//
+// Each thread processes one entry of pShakeID / pShakeParameter at a time and
+// then advances by the total number of launched threads. An entry describes a
+// central atom (atomID.x) and up to three partners (atomID.y, .z, .w); a
+// partner index of -1 means "not present". All partners of one entry share the
+// same parameters (M, d2, invMassJ).
+//
+// Reference positions come from pOldPosq and the values being corrected from
+// pPosq. The loop works on displacements relative to the reference positions:
+// when DeltaShake is NOT defined the reference is subtracted first; when it is
+// defined the pPosq values are used as loaded (presumably they already are
+// displacements - confirm against the update kernels). After at most 15
+// sweeps the reference positions are added back and the results are stored in
+// pPosq.
+//
+// NOTE(review): sA is sized by G8X_THREADS_PER_BLOCK while the launch uses
+// shake_threads_per_block; this assumes blockDim.x <= G8X_THREADS_PER_BLOCK.
+__global__ void kApplySecondShake_kernel()
+{
+    // Per-thread scratch record in shared memory; a thread only touches its own slot.
+    __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
+    Atom* psA = &sA[threadIdx.x];
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.ShakeConstraints)
+    {
+        // Load the cluster description: atom indices and shared parameters.
+        int4 atomID         = cSim.pShakeID[pos];
+        float4 params       = cSim.pShakeParameter[pos];
+        float4 apos         = cSim.pOldPosq[atomID.x];
+        float4 xpi          = cSim.pPosq[atomID.x];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
+        float4 xpj1         = cSim.pPosq[atomID.y];
+        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
+        // params = (InvMassI, M, d2, invMassJ); d2 is compared against squared
+        // distances below.
+        psA->InvMassI       = params.x;
+        psA->M              = params.y;
+        psA->d2             = params.z;
+        float invMassJ      = params.w;
+        // Optional second and third partners stay zeroed when absent.
+        if (atomID.z != -1)
+        {
+            apos2           = cSim.pOldPosq[atomID.z]; 
+            xpj2            = cSim.pPosq[atomID.z];
+        }   
+        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
+        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
+        if (atomID.w != -1)
+        {
+            apos3           = cSim.pOldPosq[atomID.w]; 
+            xpj3            = cSim.pPosq[atomID.w];
+        }    
+       
+        // Reference (old) positions as float3.
+        float3 xi, xj1, xj2, xj3;
+        xi.x                = apos.x;
+        xi.y                = apos.y;
+        xi.z                = apos.z;
+        xj1.x               = apos1.x;
+        xj1.y               = apos1.y;
+        xj1.z               = apos1.z;
+        xj2.x               = apos2.x;
+        xj2.y               = apos2.y;
+        xj2.z               = apos2.z;
+        xj3.x               = apos3.x;
+        xj3.y               = apos3.y;
+        xj3.z               = apos3.z;
+#ifndef DeltaShake
+        // Convert the loaded values into displacements from the reference positions.
+        xpi.x              -= xi.x;
+        xpi.y              -= xi.y;
+        xpi.z              -= xi.z;
+        xpj1.x             -= xj1.x;
+        xpj1.y             -= xj1.y;
+        xpj1.z             -= xj1.z;
+        xpj2.x             -= xj2.x;
+        xpj2.y             -= xj2.y;
+        xpj2.z             -= xj2.z;
+        xpj3.x             -= xj3.x;
+        xpj3.y             -= xj3.y;
+        xpj3.z             -= xj3.z;
+#endif
+        // Reference separation vectors (central atom minus partner) and their
+        // squared lengths.
+        psA->rij1.x         = xi.x - xj1.x;
+        psA->rij1.y         = xi.y - xj1.y;
+        psA->rij1.z         = xi.z - xj1.z;
+        psA->rij2.x         = xi.x - xj2.x;
+        psA->rij2.y         = xi.y - xj2.y;
+        psA->rij2.z         = xi.z - xj2.z;
+        psA->rij3.x         = xi.x - xj3.x;
+        psA->rij3.y         = xi.y - xj3.y;
+        psA->rij3.z         = xi.z - xj3.z;
+        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
+        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
+        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
+        // d2 minus the squared reference separation, per partner.
+        float ld1           = psA->d2 - psA->rij1sq;
+        float ld2           = psA->d2 - psA->rij2sq;
+        float ld3           = psA->d2 - psA->rij3sq;
+        
+        
+        // Sweep over the constraints until none needed a correction, or 15
+        // sweeps have been done.
+        bool converged = false;
+        int iteration = 0;
+        while (iteration < 15 && !converged)
+        {
+            converged = true;
+            // Constraint with partner 1 (always present).
+            float3 rpij;
+            rpij.x          = xpi.x - xpj1.x;
+            rpij.y          = xpi.y - xpj1.y;
+            rpij.z          = xpi.z - xpj1.z;
+		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; 
+            // ld - 2*rrpr - rpsqij equals d2 - |rij + rpij|^2; diff expresses
+            // its magnitude in units of d2 * shakeTolerance.
+		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
+            if (diff >= 1.0f)
+            {
+                // Move both atoms along the reference separation vector, in
+                // opposite directions, weighted by their inverse masses.
+                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
+                float3 dr;
+                dr.x    = psA->rij1.x * acor;
+                dr.y    = psA->rij1.y * acor;
+                dr.z    = psA->rij1.z * acor;
+		        xpi.x  += dr.x * psA->InvMassI;
+		        xpi.y  += dr.y * psA->InvMassI;
+		        xpi.z  += dr.z * psA->InvMassI;
+		        xpj1.x -= dr.x * invMassJ;
+		        xpj1.y -= dr.y * invMassJ;
+		        xpj1.z -= dr.z * invMassJ;
+                converged = false;
+            }
+            
+            // Constraint with partner 2, if present; uses the already updated xpi.
+            if (atomID.z != -1)
+            {
+                rpij.x          = xpi.x - xpj2.x;
+                rpij.y          = xpi.y - xpj2.y;
+                rpij.z          = xpi.z - xpj2.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; 
+		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
+                    float3 dr;
+                    dr.x    = psA->rij2.x * acor;
+                    dr.y    = psA->rij2.y * acor;
+                    dr.z    = psA->rij2.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj2.x -= dr.x * invMassJ;
+		            xpj2.y -= dr.y * invMassJ;
+		            xpj2.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+            
+            // Constraint with partner 3, if present.
+            if (atomID.w != -1)
+            {
+                rpij.x          = xpi.x - xpj3.x;
+                rpij.y          = xpi.y - xpj3.y;
+                rpij.z          = xpi.z - xpj3.z;
+		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
+		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; 
+		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
+                if (diff >= 1.0f)
+                {
+                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
+                    float3 dr;
+                    dr.x    = psA->rij3.x * acor;
+                    dr.y    = psA->rij3.y * acor;
+                    dr.z    = psA->rij3.z * acor;
+		            xpi.x  += dr.x * psA->InvMassI;
+		            xpi.y  += dr.y * psA->InvMassI;
+		            xpi.z  += dr.z * psA->InvMassI;
+		            xpj3.x -= dr.x * invMassJ;
+		            xpj3.y -= dr.y * invMassJ;
+		            xpj3.z -= dr.z * invMassJ;
+                    converged = false;
+                }
+            }
+            iteration++;
+        }
+        
+        // Turn the corrected displacements back into absolute positions. This
+        // is done regardless of whether DeltaShake is defined.
+        xpi.x += xi.x;
+        xpi.y += xi.y;
+        xpi.z += xi.z;
+        xpj1.x += xj1.x;
+        xpj1.y += xj1.y;
+        xpj1.z += xj1.z;
+        xpj2.x += xj2.x;
+        xpj2.y += xj2.y;
+        xpj2.z += xj2.z;
+        xpj3.x += xj3.x;
+        xpj3.y += xj3.y;
+        xpj3.z += xj3.z;
+
+        // Store results; absent partners are not written.
+        cSim.pPosq[atomID.x] = xpi;
+        cSim.pPosq[atomID.y] = xpj1;
+
+        if (atomID.z != -1)
+            cSim.pPosq[atomID.z] = xpj2;
+
+        if (atomID.w != -1)
+            cSim.pPosq[atomID.w] = xpj3;     
+   
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+// For every atom index listed in pNonShakeID, adds the x/y/z of the saved
+// position in pOldPosq onto the value currently held in pPosq and writes the
+// sum back. The w component of pPosq is left as loaded.
+// Each thread starts at its global index and advances by the total number of
+// launched threads.
+__global__ void kApplyNoShake_kernel()
+{
+    const unsigned int stride = blockDim.x * gridDim.x;
+    for (unsigned int entry = threadIdx.x + blockIdx.x * blockDim.x;
+         entry < cSim.NonShakeConstraints;
+         entry += stride)
+    {
+        const int atom          = cSim.pNonShakeID[entry];
+        const float4 saved      = cSim.pOldPosq[atom];
+        float4 updated          = cSim.pPosq[atom];
+
+        updated.x              += saved.x;
+        updated.y              += saved.y;
+        updated.z              += saved.z;
+
+        cSim.pPosq[atom]        = updated;
+    }
+}
+
+// Empty placeholder: the body does nothing and `gpu` is unused. The only
+// reference visible nearby is a commented-out call in kApplySecondShake.
+void kCPUShake2(gpuContext gpu)
+{
+
+}
+
+// Host-side driver for the second constraint pass.
+//
+// Launches kApplySecondShake_kernel when at least one constrained cluster
+// exists. When the file is compiled with DeltaShake, it additionally launches
+// kApplyNoShake_kernel for the atoms listed as unconstrained, provided there
+// are any. Each launch is followed by LAUNCHERROR.
+void kApplySecondShake(gpuContext gpu)
+{
+    const bool haveShakeClusters = (gpu->sim.ShakeConstraints > 0);
+    if (haveShakeClusters)
+    {
+        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
+        LAUNCHERROR("kApplySecondShake");
+    }
+
+#if defined(DeltaShake)
+    // Atoms outside any constrained cluster are handled by a separate kernel.
+    const bool haveFreeAtoms = (gpu->sim.NonShakeConstraints > 0);
+    if (haveFreeAtoms)
+    {
+        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
+        LAUNCHERROR("kApplyNoShake");
+    }
+#endif
+}
+
--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+
+#define DeltaShake
+
+static __constant__ cudaGmxSimulation cSim;
+
+// Copies the host-side simulation description (gpu->sim) into this file's
+// __constant__ cSim symbol; the copy's status is handed to RTERROR.
+void SetVerletUpdateSim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Copies this file's __constant__ cSim symbol back into the host-side
+// simulation description (gpu->sim); the copy's status is handed to RTERROR.
+void GetVerletUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // The message previously said "SetSim" (copied from SetVerletUpdateSim),
+    // which misreported which direction of the transfer had failed.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+// First part of the Verlet step.
+//
+// For each atom: saves the current position to pOldPosq, advances the
+// velocity by deltaT * inverseMass * force (velocity.w holds the factor that
+// multiplies deltaT), and writes a trial value to pPosqP. With DeltaShake
+// defined the trial value is just velocity * deltaT; otherwise that product is
+// added to the current position. The w component of the position is passed
+// through unchanged.
+// Each thread starts at its global index and advances by the total number of
+// launched threads.
+__global__ void kVerletUpdatePart1_kernel()
+{
+    const unsigned int stride = blockDim.x * gridDim.x;
+    // No shared data is exchanged in this kernel; the barrier is retained as is.
+    __syncthreads();
+
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        const float4 start      = cSim.pPosq[atom];
+        float4 vel              = cSim.pVelm4[atom];
+        const float4 frc        = cSim.pForce4[atom];
+        const float dtOverMass  = cSim.deltaT*vel.w;
+
+        cSim.pOldPosq[atom]     = start;
+
+        vel.x                  += dtOverMass*frc.x;
+        vel.y                  += dtOverMass*frc.y;
+        vel.z                  += dtOverMass*frc.z;
+
+        float4 trial            = start;
+#ifndef DeltaShake
+        trial.x                += vel.x*cSim.deltaT;
+        trial.y                += vel.y*cSim.deltaT;
+        trial.z                += vel.z*cSim.deltaT;
+#else
+        trial.x                 = vel.x*cSim.deltaT;
+        trial.y                 = vel.y*cSim.deltaT;
+        trial.z                 = vel.z*cSim.deltaT;
+#endif
+
+        cSim.pPosqP[atom]       = trial;
+        cSim.pVelm4[atom]       = vel;
+    }
+}
+
+// First part of the Verlet step with removal of the accumulated
+// center-of-mass term.
+//
+// Launch requirements (see kVerletUpdatePart1): blockDim.x * sizeof(float3)
+// bytes of dynamic shared memory for sCM.
+//
+// Every block first sums the per-block values stored in pLinearMomentum (one
+// entry per block of this grid) into sCM[0], then performs the same per-atom
+// update as kVerletUpdatePart1_kernel, additionally subtracting sCM[0] from
+// each atom's velocity.
+__global__ void kVerletUpdatePart1CM_kernel()
+{
+    extern __shared__ float3 sCM[];
+
+    // Each thread gathers a strided share of the per-block entries.
+    float3 partial = { 0.0f, 0.0f, 0.0f};
+    for (unsigned int entry = threadIdx.x; entry < gridDim.x; entry += blockDim.x)
+    {
+        const float4 blockValue = cSim.pLinearMomentum[entry];
+        partial.x      += blockValue.x;
+        partial.y      += blockValue.y;
+        partial.z      += blockValue.z;
+    }
+    sCM[threadIdx.x].x  = partial.x;
+    sCM[threadIdx.x].y  = partial.y;
+    sCM[threadIdx.x].z  = partial.z;
+    __syncthreads();
+
+    // In-block reduction: in each round the threads whose low bits (mask) are
+    // clear add in the partial sum `offset` slots above them.
+    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
+    {
+        const unsigned int partner = threadIdx.x + offset;
+        if (((threadIdx.x & mask) == 0) && (partner < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[partner].x;
+            sCM[threadIdx.x].y += sCM[partner].y;
+            sCM[threadIdx.x].z += sCM[partner].z;
+        }
+        __syncthreads();
+    }
+
+    // Per-atom update; sCM[0] now holds the block-wide total.
+    const unsigned int stride = blockDim.x * gridDim.x;
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        const float4 start      = cSim.pPosq[atom];
+        float4 vel              = cSim.pVelm4[atom];
+        const float4 frc        = cSim.pForce4[atom];
+        const float dtOverMass  = cSim.deltaT*vel.w;
+
+        cSim.pOldPosq[atom]     = start;
+
+        vel.x                  += dtOverMass*frc.x-sCM[0].x;
+        vel.y                  += dtOverMass*frc.y-sCM[0].y;
+        vel.z                  += dtOverMass*frc.z-sCM[0].z;
+
+        float4 trial            = start;
+#ifndef DeltaShake
+        trial.x                += vel.x*cSim.deltaT;
+        trial.y                += vel.y*cSim.deltaT;
+        trial.z                += vel.z*cSim.deltaT;
+#else
+        trial.x                 = vel.x*cSim.deltaT;
+        trial.y                 = vel.y*cSim.deltaT;
+        trial.z                 = vel.z*cSim.deltaT;
+#endif
+
+        cSim.pPosqP[atom]       = trial;
+        cSim.pVelm4[atom]       = vel;
+    }
+}
+
+// Host-side driver for the first part of the Verlet step.
+//
+// When gpu->bRemoveCM is set, launches the variant that subtracts the
+// accumulated center-of-mass term (it needs one float3 of dynamic shared
+// memory per thread) and clears the flag; otherwise launches the plain kernel.
+void kVerletUpdatePart1(gpuContext gpu)
+{
+    if (!gpu->bRemoveCM)
+    {
+        kVerletUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kVerletUpdatePart1");
+    }
+    else
+    {
+        kVerletUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kVerletUpdatePart1CM");
+        gpu->bRemoveCM = false;
+    }
+}
+
+// Second part of the Verlet step.
+//
+// For each atom: derives the velocity from the trial value in pPosqP scaled by
+// oneOverDeltaT, then stores the new position and velocity. With DeltaShake
+// defined the trial value is used directly for the velocity and the current
+// position is added to it before it is stored in pPosq; otherwise the velocity
+// comes from (trial - current) and the trial value is stored unchanged.
+// The velocity's w component is carried over as loaded.
+__global__ void kVerletUpdatePart2_kernel()
+{
+    const unsigned int stride = blockDim.x * gridDim.x;
+    // No shared data is exchanged in this kernel; the barrier is retained as is.
+    __syncthreads();
+
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        float4 vel              = cSim.pVelm4[atom];
+        const float4 current    = cSim.pPosq[atom];
+        float4 trial            = cSim.pPosqP[atom];
+
+#ifndef DeltaShake
+        vel.x                   = cSim.oneOverDeltaT*(trial.x-current.x);
+        vel.y                   = cSim.oneOverDeltaT*(trial.y-current.y);
+        vel.z                   = cSim.oneOverDeltaT*(trial.z-current.z);
+#else
+        vel.x                   = cSim.oneOverDeltaT*(trial.x);
+        vel.y                   = cSim.oneOverDeltaT*(trial.y);
+        vel.z                   = cSim.oneOverDeltaT*(trial.z);
+
+        trial.x                += current.x;
+        trial.y                += current.y;
+        trial.z                += current.z;
+#endif
+
+        cSim.pPosq[atom]        = trial;
+        cSim.pVelm4[atom]       = vel;
+    }
+}
+
+// Second part of the Verlet step, fused with a per-block accumulation of the
+// mass-weighted velocity.
+//
+// Launch requirements (see kVerletUpdatePart2): blockDim.x * sizeof(float3)
+// bytes of dynamic shared memory for sCM.
+//
+// The per-atom work matches kVerletUpdatePart2_kernel. In addition each thread
+// sums mass * velocity over its atoms (mass = 1 / velocity.w), scales the sum
+// by inverseTotalMass, and the block reduces these sums in shared memory.
+// Thread 0 writes the block total to pLinearMomentum[blockIdx.x] with w = 0.
+__global__ void kVerletUpdatePart2CM_kernel()
+{
+    extern __shared__ float3 sCM[];
+    const unsigned int stride = blockDim.x * gridDim.x;
+    float3 momentum = {0.0f, 0.0f, 0.0f};
+    __syncthreads();
+
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        float4 vel              = cSim.pVelm4[atom];
+        const float4 current    = cSim.pPosq[atom];
+        float4 trial            = cSim.pPosqP[atom];
+        const float mass        = 1.0f / vel.w;
+
+#ifndef DeltaShake
+        vel.x                   = cSim.oneOverDeltaT*(trial.x-current.x);
+        vel.y                   = cSim.oneOverDeltaT*(trial.y-current.y);
+        vel.z                   = cSim.oneOverDeltaT*(trial.z-current.z);
+#else
+        vel.x                   = cSim.oneOverDeltaT*(trial.x);
+        vel.y                   = cSim.oneOverDeltaT*(trial.y);
+        vel.z                   = cSim.oneOverDeltaT*(trial.z);
+
+        trial.x                += current.x;
+        trial.y                += current.y;
+        trial.z                += current.z;
+#endif
+
+        momentum.x             += mass * vel.x;
+        momentum.y             += mass * vel.y;
+        momentum.z             += mass * vel.z;
+
+        cSim.pPosq[atom]        = trial;
+        cSim.pVelm4[atom]       = vel;
+    }
+
+    // Scale the thread's sum and publish it to the block.
+    momentum.x *= cSim.inverseTotalMass;
+    momentum.y *= cSim.inverseTotalMass;
+    momentum.z *= cSim.inverseTotalMass;
+    sCM[threadIdx.x] = momentum;
+    __syncthreads();
+
+    // In-block reduction: in each round the threads whose low bits (mask) are
+    // clear add in the partial sum `offset` slots above them.
+    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
+    {
+        const unsigned int partner = threadIdx.x + offset;
+        if (((threadIdx.x & mask) == 0) && (partner < blockDim.x))
+        {
+            sCM[threadIdx.x].x += sCM[partner].x;
+            sCM[threadIdx.x].y += sCM[partner].y;
+            sCM[threadIdx.x].z += sCM[partner].z;
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0)
+    {
+        float4 blockTotal = { sCM[0].x, sCM[0].y, sCM[0].z, 0.0f };
+        cSim.pLinearMomentum[blockIdx.x] = blockTotal;
+    }
+}
+
+// Host-side driver for the second part of the Verlet step.
+//
+// When gpu->bCalculateCM is set, launches the variant that also accumulates
+// per-block mass-weighted velocities (one float3 of dynamic shared memory per
+// thread), then clears bCalculateCM and raises bRemoveCM. Otherwise launches
+// the plain kernel.
+void kVerletUpdatePart2(gpuContext gpu)
+{
+    if (!gpu->bCalculateCM)
+    {
+        kVerletUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+        LAUNCHERROR("kVerletUpdatePart2");
+    }
+    else
+    {
+        kVerletUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
+        LAUNCHERROR("kVerletUpdatePart2CM");
+        gpu->bRemoveCM      = true;
+        gpu->bCalculateCM   = false;
+    }
+}
+