Checked in Cuda code

38f6c8f8 · Peter Eastman · 95d79181 · 38f6c8f8 · 38f6c8f8 · 38f6c8f8
Commit 38f6c8f8 authored Jan 27, 2009 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+// Declarations of the host-callable functions of the CUDA platform. Every
+// function takes the gpuContext handle declared in gputypes.h.
+//
+// Initialization
+extern void kClearForces(gpuContext gpu);
+extern void kCalculateObcGbsaBornSum(gpuContext gpu);
+extern void kReduceObcGbsaBornSum(gpuContext gpu);
+extern void kGenerateRandoms(gpuContext gpu);
+// Main loop
+// NOTE(review): the "_12" variants presumably target SM_12 devices (see
+// SM_VERSION in gputypes.h) -- confirm against the callers.
+extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
+extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
+extern void kCalculateCDLJForces(gpuContext gpu);
+extern void kCalculateCDLJForces_12(gpuContext gpu);
+extern void kCalculateObcGbsaForces1(gpuContext gpu);
+extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
+extern void kReduceObcGbsaBornForces(gpuContext gpu);
+extern void kCalculateObcGbsaForces2(gpuContext gpu);
+extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
+extern void kCalculateLocalForces(gpuContext gpu);
+extern void kCalculateAndersenThermostat(gpuContext gpu);
+extern void kReduceBornSumAndForces(gpuContext gpu);
+extern void kUpdatePart1(gpuContext gpu);
+extern void kApplyFirstShake(gpuContext gpu);
+extern void kUpdatePart2(gpuContext gpu);
+extern void kApplySecondShake(gpuContext gpu);
+extern void kVerletUpdatePart1(gpuContext gpu);
+extern void kVerletUpdatePart2(gpuContext gpu);
+extern void kBrownianUpdatePart1(gpuContext gpu);
+extern void kBrownianUpdatePart2(gpuContext gpu);
+// Extras
+extern void kReduceForces(gpuContext gpu);
+extern void kClearBornForces(gpuContext gpu);
+// Initializers
+// The Set*Sim/Get*Sim pairs defined in kBrownianUpdate.cu,
+// kCalculateAndersenThermostat.cu and kCalculateCDLJForces.cu copy gpu->sim
+// to, or back from, a file-local __constant__ cudaGmxSimulation.
+// NOTE(review): the remaining pairs presumably follow the same pattern --
+// confirm in their .cu files.
+extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
+extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
+extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
+extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
+extern void SetCalculateCDLJForcesSim(gpuContext gpu);
+extern void GetCalculateCDLJForcesSim(gpuContext gpu);
+extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
+extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
+extern void SetCalculateLocalForcesSim(gpuContext gpu);
+extern void GetCalculateLocalForcesSim(gpuContext gpu);
+extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
+extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
+extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
+extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
+extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
+extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
+extern void SetForcesSim(gpuContext gpu);
+extern void GetForcesSim(gpuContext gpu);
+extern void SetUpdateShakeHSim(gpuContext gpu);
+extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void SetVerletUpdateSim(gpuContext gpu);
+extern void GetVerletUpdateSim(gpuContext gpu);
+extern void SetBrownianUpdateSim(gpuContext gpu);
+extern void GetBrownianUpdateSim(gpuContext gpu);
+extern void SetRandomSim(gpuContext gpu);
+extern void GetRandomSim(gpuContext gpu);
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
+#ifndef __GPUTYPES_H__
+#define __GPUTYPES_H__
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "cudatypes.h"
+#include <string>
+#include <vector>
+// One entry of the atom-type table referenced by _gpuContext::gpAtomTable.
+// The name is spelled std::string so this header no longer depends on the
+// includer having written `using namespace std;` beforehand.
+struct gpuAtomType {
+    std::string name;   // type name
+    char symbol;        // one-character symbol for the type
+    float r;            // NOTE(review): presumably the atomic radius returned
+                        // by gpuGetAtomicRadius -- confirm in gpu.cpp
+};
+// Value type of _gpuContext::sm_version.
+// NOTE(review): the enumerators presumably name CUDA compute capabilities
+// 1.0, 1.1 and 1.2 -- confirm where sm_version is assigned.
+enum SM_VERSION {
+    SM_10 = 0,
+    SM_11 = 1,
+    SM_12 = 2
+};
+/* Host-side state for one simulation on the GPU. A pointer to this structure
+ * (the gpuContext typedef that follows it) will be given to gromacs
+ * functions; it is the `gpu` argument of the host functions declared below. */
+struct _gpuContext {
+    //Cache this here so that it doesn't
+    //have to be repeatedly passed around
+    int natoms;
+    gpuAtomType* gpAtomTable;
+    int gAtomTypes;
+    // Simulation parameters. The Set*Sim/Get*Sim helpers in the kernel files
+    // copy this member to and from their __constant__ cSim symbol.
+    cudaGmxSimulation sim;
+    unsigned int* pOutputBufferCounter;
+    unsigned int* pExclusion;
+    unsigned char* pAtomSymbol;
+    float iterations;
+    float epsfac;
+    float solventDielectric;
+    float soluteDielectric;
+    int grid;
+    bool bCalculateCM;
+    bool bRemoveCM;
+	 bool bRecalculateBornRadii;
+    unsigned long seed;
+    SM_VERSION sm_version;
+    // CUDAStream<T> comes from cudatypes.h, which is not shown here.
+    // NOTE(review): the ps* members presumably own the buffers whose device
+    // pointers are mirrored in `sim` -- confirm in gpu.cpp.
+    CUDAStream<float4>* psPosq4;
+    CUDAStream<float4>* psPosqP4;
+    CUDAStream<float4>* psOldPosq4;
+    CUDAStream<float4>* psVelm4;
+    CUDAStream<float4>* psForce4;
+    CUDAStream<float4>* psxVector4;
+    CUDAStream<float4>* psvVector4;
+    CUDAStream<float2>* psSigEps2; 
+    CUDAStream<float2>* psObcData; 
+    CUDAStream<float>* psObcChain;
+    CUDAStream<float>* psBornForce;
+    CUDAStream<float>* psBornRadii;
+    CUDAStream<float>* psBornSum;
+    CUDAStream<int4>* psBondID;
+    CUDAStream<float2>* psBondParameter;
+    CUDAStream<int4>* psBondAngleID1;
+    CUDAStream<int2>* psBondAngleID2;
+    CUDAStream<float2>* psBondAngleParameter;
+    CUDAStream<int4>* psDihedralID1;
+    CUDAStream<int4>* psDihedralID2;
+    CUDAStream<float4>* psDihedralParameter;
+    CUDAStream<int4>* psRbDihedralID1;
+    CUDAStream<int4>* psRbDihedralID2;
+    CUDAStream<float4>* psRbDihedralParameter1;
+    CUDAStream<float2>* psRbDihedralParameter2;
+    CUDAStream<int4>* psLJ14ID;
+    CUDAStream<float4>* psLJ14Parameter;
+    CUDAStream<int>* psNonShakeID;
+    CUDAStream<int4>* psShakeID;
+    CUDAStream<float4>* psShakeParameter;
+    CUDAStream<unsigned int>* psExclusion;
+    CUDAStream<unsigned int>* psWorkUnit;
+    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
+    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
+    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
+    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
+    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
+};
+// Handle type used throughout the API: a pointer to _gpuContext.
+typedef struct _gpuContext *gpuContext;
+// Function prototypes
+// All of these are declared extern "C", which gives them C language linkage;
+// the std::vector reference parameters still make most of them callable from
+// C++ only.
+extern "C"
+bool gpuIsAvailable();
+// Bonded-term parameters. Each term has a gpuRead* variant taking a file name
+// and a gpuSet* variant taking parallel per-term vectors.
+// NOTE(review): the vectors are presumably required to have equal length --
+// confirm in gpu.cpp.
+extern "C"
+int gpuReadBondParameters(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
+extern "C"
+int gpuReadBondAngleParameters(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
+        const std::vector<float>& angle, const std::vector<float>& k);
+extern "C"
+int gpuReadDihedralParameters(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
+        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
+extern "C"
+int gpuReadRbDihedralParameters(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
+        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
+extern "C"
+int gpuReadLJ14Parameters(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
+        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
+// Atom-type lookups and nonbonded (Coulomb/LJ) parameter setup.
+// Standard-library types are fully qualified so that this header does not
+// rely on a `using namespace std;` appearing before it is included; the
+// parameter types themselves are unchanged.
+extern "C"
+float gpuGetAtomicRadius(gpuContext gpu, std::string s);
+extern "C"
+unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
+extern "C"
+int gpuReadAtomicParameters(gpuContext gpu, char* fname);
+extern "C"
+int gpuReadCoulombParameters(gpuContext gpu, char* fname);
+// exclusions holds one vector of int per atom.
+// NOTE(review): presumably the indices of atoms excluded from that atom's
+// nonbonded interactions -- confirm in gpu.cpp.
+extern "C"
+void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
+        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions);
+// Implicit-solvent (OBC) and SHAKE constraint parameters.
+extern "C"
+void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
+extern "C"
+int gpuReadShakeParameters(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
+        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);
+// Buffer allocation and per-atom state (positions, velocities, masses).
+extern "C"
+int gpuAllocateInitialBuffers(gpuContext gpu);
+extern "C"
+void gpuReadCoordinates(gpuContext gpu, char* fname);
+extern "C"
+void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
+extern "C"
+void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
+extern "C"
+void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
+extern "C"
+void gpuInitializeRandoms(gpuContext gpu);
+// Context creation and teardown.
+// NOTE(review): both gpuInit variants return void* rather than gpuContext;
+// callers presumably cast the result -- confirm at the call sites.
+extern "C"
+void* gpuInitFromFile(char* fname);
+extern "C"
+void* gpuInit(int numAtoms);
+// Integrator and thermostat parameters.
+extern "C"
+void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
+extern "C"
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT);
+extern "C"
+void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
+extern "C"
+void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability);
+extern "C"
+void gpuShutDown(gpuContext gpu);
+// Work-list construction and constant upload.
+extern "C"
+int gpuBuildOutputBuffers(gpuContext gpu);
+extern "C"
+int gpuBuildThreadBlockWorkList(gpuContext gpu);
+extern "C"
+int gpuBuildExclusionList(gpuContext gpu);
+extern "C"
+int gpuSetConstants(gpuContext gpu);
+// Debugging and validation helpers.
+extern "C"
+void gpuDumpCoordinates(gpuContext gpu);
+extern "C"
+void gpuDumpPrimeCoordinates(gpuContext gpu);
+extern "C"
+void gpuDumpForces(gpuContext gpu);
+extern "C"
+void gpuDumpAtomData(gpuContext gpu);
+extern "C"
+bool gpuCheckData(gpuContext gpu);
+extern "C"
+void gpuSetup(void* pVoid);
+// NOTE(review): the kCPU* names suggest CPU reference implementations --
+// confirm in gpu.cpp.
+extern "C"
+void kCPUCalculate14(gpuContext gpu);
+extern "C"
+void kCPUCalculateLocalForces(gpuContext gpu);
+// One writer per element width: float, float2, float3 and float4 streams.
+extern "C"
+void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>*  psPos, int numPrint );
+extern "C"
+void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint );
+extern "C"
+void WriteArrayToFile3( gpuContext gpu, char* fname, int step, CUDAStream<float3>* psPos, int numPrint );
+extern "C"
+void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint );
+extern "C"
+void gpuDumpObcInfo(gpuContext gpu);
+extern "C"
+void gpuDumpObcLoop1(gpuContext gpu); 
+#endif //__GPUTYPES_H__
--- a/platforms/cuda/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda/src/kernels/kBrownianUpdate.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+#include "gputypes.h"
+// When DeltaShake is defined, kBrownianUpdatePart1_kernel stores only the
+// per-step displacement in pPosqP and kBrownianUpdatePart2_kernel adds the
+// old position back; otherwise pPosqP holds the absolute new position.
+#define DeltaShake
+// File-local copy of the simulation parameters in constant memory; written by
+// SetBrownianUpdateSim and read by the kernels in this file.
+static __constant__ cudaGmxSimulation cSim;
+// Copies gpu->sim into this file's __constant__ cSim and reports the result
+// of the copy through RTERROR.
+void SetBrownianUpdateSim(gpuContext gpu)
+{
+    cudaError_t copyResult =
+        cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyResult, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+// Copies this file's __constant__ cSim back into gpu->sim and reports the
+// result of the copy through RTERROR. The message names GetSim (it previously
+// said SetSim) so a failure here is not mistaken for one in
+// SetBrownianUpdateSim.
+void GetBrownianUpdateSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+// Brownian update, stage 1. For every atom the kernel saves the current
+// position in pOldPosq and then writes force*GDT + random to pPosqP: with
+// DeltaShake defined that value is stored on its own, otherwise it is added
+// to the current position. The w component of the position is carried through
+// unchanged. Atoms are distributed over threads with a stride of
+// blockDim.x * gridDim.x.
+__global__ void kBrownianUpdatePart1_kernel()
+{
+    const unsigned int stride       = blockDim.x * gridDim.x;
+    const unsigned int randomBase   = cSim.pRandomPosition[blockIdx.x];
+    __syncthreads();
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        float4 noise            = cSim.pRandom4a[randomBase + atom];
+        float4 current          = cSim.pPosq[atom];
+        float4 f                = cSim.pForce4[atom];
+        cSim.pOldPosq[atom]     = current;
+#ifdef DeltaShake
+        // Displacement only; stage 2 adds the old position back.
+        current.x               = f.x*cSim.GDT + noise.x;
+        current.y               = f.y*cSim.GDT + noise.y;
+        current.z               = f.z*cSim.GDT + noise.z;
+#else
+        current.x              += f.x*cSim.GDT + noise.x;
+        current.y              += f.y*cSim.GDT + noise.y;
+        current.z              += f.z*cSim.GDT + noise.z;
+#endif
+        cSim.pPosqP[atom]       = current;
+    }
+}
+// Host wrapper: launches kBrownianUpdatePart1_kernel on gpu->sim.blocks
+// blocks of gpu->sim.update_threads_per_block threads, then invokes
+// LAUNCHERROR.
+void kBrownianUpdatePart1(gpuContext gpu)
+{
+    const dim3 gridSize(gpu->sim.blocks);
+    const dim3 blockSize(gpu->sim.update_threads_per_block);
+    kBrownianUpdatePart1_kernel<<<gridSize, blockSize>>>();
+    LAUNCHERROR("kBrownianUpdatePart1");
+}
+// Brownian update, stage 2. For every atom the kernel derives the velocity
+// from the stage-1 result in pPosqP scaled by oneOverDeltaT and writes the
+// new position to pPosq (with DeltaShake defined, pPosqP holds a displacement
+// that is added to the old position here). Thread 0 of each block then
+// advances that block's entry in pRandomPosition by paddedNumberOfAtoms.
+__global__ void kBrownianUpdatePart2_kernel()
+{
+    const unsigned int stride   = blockDim.x * gridDim.x;
+    unsigned int randomBase     = cSim.pRandomPosition[blockIdx.x];
+    // All threads of the block have read the offset before thread 0 rewrites
+    // it at the end of the kernel.
+    __syncthreads();
+    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
+    {
+        float4 vel              = cSim.pVelm4[atom];
+        float4 oldPos           = cSim.pPosq[atom];
+        float4 newPos           = cSim.pPosqP[atom];
+#ifdef DeltaShake
+        vel.x                   = cSim.oneOverDeltaT*(newPos.x);
+        vel.y                   = cSim.oneOverDeltaT*(newPos.y);
+        vel.z                   = cSim.oneOverDeltaT*(newPos.z);
+        newPos.x               += oldPos.x;
+        newPos.y               += oldPos.y;
+        newPos.z               += oldPos.z;
+#else
+        vel.x                   = cSim.oneOverDeltaT*(newPos.x-oldPos.x);
+        vel.y                   = cSim.oneOverDeltaT*(newPos.y-oldPos.y);
+        vel.z                   = cSim.oneOverDeltaT*(newPos.z-oldPos.z);
+#endif
+        cSim.pPosq[atom]        = newPos;
+        cSim.pVelm4[atom]       = vel;
+    }
+    // Only thread 0 of each block updates the random position pointer.
+    if (threadIdx.x != 0)
+        return;
+    randomBase                 += cSim.paddedNumberOfAtoms;
+    if (randomBase > cSim.randoms)
+        randomBase             -= cSim.randoms;
+    cSim.pRandomPosition[blockIdx.x] = randomBase;
+}
+extern void kGenerateRandoms(gpuContext gpu);
+// Host wrapper: launches kBrownianUpdatePart2_kernel, invokes LAUNCHERROR,
+// and calls kGenerateRandoms once every gpu->sim.randomIterations calls.
+void kBrownianUpdatePart2(gpuContext gpu)
+{
+    kBrownianUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kBrownianUpdatePart2");
+    // Function-local static: a single counter shared by every call of this
+    // function, whichever gpu is passed.
+    static int launchesSinceRefill = 0;
+    if (++launchesSinceRefill != gpu->sim.randomIterations)
+        return;
+    kGenerateRandoms(gpu);
+    launchesSinceRefill = 0;
+}
--- a/platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+++ b/platforms/cuda/src/kernels/kCalculateAndersenThermostat.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+#include "gputypes.h"
+// File-local copy of the simulation parameters in constant memory; written by
+// SetCalculateAndersenThermostatSim and read by the kernel in this file.
+static __constant__ cudaGmxSimulation cSim;
+// Copies gpu->sim into this file's __constant__ cSim and reports the result
+// of the copy through RTERROR.
+void SetCalculateAndersenThermostatSim(gpuContext gpu)
+{
+    cudaError_t copyResult =
+        cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyResult, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+// Copies this file's __constant__ cSim back into gpu->sim and reports the
+// result of the copy through RTERROR. The message names GetSim (it previously
+// said SetSim) so a failure here is not mistaken for one in
+// SetCalculateAndersenThermostatSim.
+void GetCalculateAndersenThermostatSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+// Andersen thermostat. For every atom whose random w component is below
+// cSim.collisionProbability, the velocity x/y/z are replaced by
+// sqrt(kT * velocity.w) times the random x/y/z; all other atoms keep their
+// velocity. Thread 0 of each block then advances that block's entry in
+// pRandomPosition by paddedNumberOfAtoms.
+// NOTE(review): velocity.w presumably holds the inverse mass -- confirm where
+// pVelm4 is filled.
+// All arithmetic uses float literals and sqrtf so that nothing is promoted to
+// double inside the kernel.
+__global__ void kCalculateAndersenThermostat_kernel()
+{
+    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
+    // All threads of the block have read the offset before thread 0 rewrites
+    // it at the end of the kernel.
+    __syncthreads();
+    while (pos < cSim.atoms)
+    {
+        float4 velocity         = cSim.pVelm4[pos];
+        float4 random4a         = cSim.pRandom4a[rpos + pos];
+        // scale is 0 for a colliding atom (velocity replaced) and 1 otherwise.
+        float scale = (random4a.w < cSim.collisionProbability ? 0.0f : 1.0f);
+        float add = (1.0f-scale)*sqrtf(cSim.kT*velocity.w);
+        velocity.x = scale*velocity.x + add*random4a.x;
+        velocity.y = scale*velocity.y + add*random4a.y;
+        velocity.z = scale*velocity.z + add*random4a.z;
+        cSim.pVelm4[pos]        = velocity;
+        pos                    += blockDim.x * gridDim.x;
+    }
+    // Update random position pointer
+    if (threadIdx.x == 0)
+    {
+        rpos                   += cSim.paddedNumberOfAtoms;
+        if (rpos > cSim.randoms)
+            rpos               -= cSim.randoms;
+        cSim.pRandomPosition[blockIdx.x] = rpos;
+    }
+}
+extern void kGenerateRandoms(gpuContext gpu);
+// Host wrapper: launches kCalculateAndersenThermostat_kernel, invokes
+// LAUNCHERROR, and calls kGenerateRandoms once every
+// gpu->sim.randomIterations calls.
+void kCalculateAndersenThermostat(gpuContext gpu)
+{
+    kCalculateAndersenThermostat_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kCalculateAndersenThermostat");
+    // Function-local static: a single counter shared by every call of this
+    // function, whichever gpu is passed.
+    static int launchesSinceRefill = 0;
+    if (++launchesSinceRefill != gpu->sim.randomIterations)
+        return;
+    kGenerateRandoms(gpu);
+    launchesSinceRefill = 0;
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+#include "gputypes.h"
+#include "cudatypes.h"
+// NOTE(review): presumably compile-time switches for unrolling the diagonal
+// (x == y) and off-diagonal tile loops of the kernel below -- their uses are
+// outside the visible part of this file; confirm there.
+#define UNROLLXX 0
+#define UNROLLXY 0
+// Per-thread atom record kept in the shared array sA. Member order (and hence
+// the memory layout) is x, y, z, q, sig, eps, fx, fy, fz, eps2, sig2.
+struct Atom {
+    float x, y, z;      // position, copied from pPosq x/y/z
+    float q;            // copied from pPosq w
+    float sig, eps;     // copied from pAttr x/y
+    float fx, fy, fz;   // NOTE(review): presumably the accumulated force -- confirm
+    float eps2, sig2;   // NOTE(review): purpose not visible here -- confirm
+};
+// Shared-memory scratch: one Atom record per thread slot.
+__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
+// Shared-memory copy of this block's slice of cSim.pWorkUnit, filled at the
+// start of the kernel.
+__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
+// sNext[i] is set to (i + 1) & (GRID - 1) by the kernel.
+// NOTE(review): the mask presumably relies on GRID being a power of two --
+// confirm where GRID is defined.
+__shared__ unsigned int sNext[GRID];
+// File-local copy of the simulation parameters in constant memory; written by
+// SetCalculateCDLJForcesSim and read by the kernel in this file.
+static __constant__ cudaGmxSimulation cSim;
+// Copies gpu->sim into this file's __constant__ cSim and reports the result
+// of the copy through RTERROR.
+void SetCalculateCDLJForcesSim(gpuContext gpu)
+{
+    cudaError_t copyResult =
+        cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyResult, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+// Copies this file's __constant__ cSim back into gpu->sim and reports the
+// result of the copy through RTERROR. The message names GetSim (it previously
+// said SetSim) so a failure here is not mistaken for one in
+// SetCalculateCDLJForcesSim.
+void GetCalculateCDLJForcesSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+// Nonbonded pair-force kernel; kCalculateCDLJForces() launches this variant
+// when gpu->sm_version < SM_12.
+//
+// The pair work is consumed as "work units" read from cSim.pWorkUnit. Each
+// work unit names a pair of atom tiles (x, y), GRID atoms each, plus a flag
+// telling whether an exclusion mask has to be applied. A group of GRID
+// consecutive threads processes one work unit at a time: thread tgx of the
+// group owns atom x + tgx and accumulates its force in registers (af), while
+// the partner tile's atoms are staged in shared memory (sA).
+//
+// For every pair the force scale is
+//     dEdR = (eps_i * eps_j * (12 * s^12 - 6 * s^6) + epsfac * q_i * q_j / r) / r^2
+// with s = (sig_i + sig_j) / r, i.e. a 12-6 Lennard-Jones-style term plus a
+// charge-product term.
+//
+// Results are stored (not accumulated) into cSim.pForce4a at index
+// atom + partnerTile * cSim.stride; presumably a later pass sums these
+// per-tile slots - TODO confirm.
+//
+// NOTE(review): sA is written and then read by other threads of the same
+// GRID-wide group with no barrier in between. This is only safe if those
+// threads run in lockstep (presumably GRID equals the warp size) - confirm
+// before using a toolchain/architecture without implicit warp synchrony.
+__global__ void kCalculateCDLJForces_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously    
+    // Block b owns work units [pos, end): nbWorkUnitsPerBlock each, and the
+    // first nbWorkUnitsPerBlockRemainder blocks take one extra unit.
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
+    // One thread per work unit copies it to shared memory.
+    // NOTE(review): assumes end - pos fits both blockDim.x and the capacity of
+    // sWorkUnit - confirm against the host-side setup.
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    // Build the cyclic successor table: sNext[i] = (i + 1) mod GRID
+    // (the mask form requires GRID to be a power of two).
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
+    }
+    __syncthreads();
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    // end becomes the number of work units held by this block; each group of
+    // threads (threadIdx.x >> GRIDBITS) starts on a different unit, counting
+    // down from the last one.
+    end = end - pos; 
+    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+    while (pos >= 0)
+    {  
+        // Extract cell coordinates from appropriate work unit
+        // Layout as decoded here: bit 0 = exclusion flag, bits 2..16 = y tile
+        // index, bits 17 and up = x tile index. Shifting by GRIDBITS turns a
+        // tile index into the index of the tile's first atom.
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17) << GRIDBITS;
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        float dx; 
+        float dy; 
+        float dz; 
+        float r2; 
+        float invR; 
+        float sig; 
+        float sig2; 
+        float sig6; 
+        float eps; 
+        float dEdR;  
+        // tgx: this thread's position inside its GRID-wide group;
+        // tbx: threadIdx.x of the group's first thread;
+        // psA: the group's own GRID-entry slice of sA (psA[tgx] is this
+        //      thread's own record, sA[threadIdx.x]).
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        int tj = tgx; 
+        Atom* psA = &sA[tbx];
+        if (!bExclusionFlag)
+        {
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                unsigned int i      = x + tgx;
+                apos                = cSim.pPosq[i];
+                float2 a            = cSim.pAttr[i];
+                sA[threadIdx.x].x   = apos.x;
+                sA[threadIdx.x].y   = apos.y;
+                sA[threadIdx.x].z   = apos.z;
+                sA[threadIdx.x].q   = apos.w;
+                sA[threadIdx.x].sig = a.x;
+                sA[threadIdx.x].eps = a.y;
+                af.x                = 0.0f;
+                af.y                = 0.0f;
+                af.z                = 0.0f;
+                // Fold epsfac into the local charge once, outside the loop.
+                apos.w             *= cSim.epsfac;
+                // Every thread visits all GRID atoms of the tile and keeps only
+                // its own force, so each pair is evaluated twice.
+                // NOTE(review): j == tgx is the atom paired with itself
+                // (r2 == 0, invR infinite) and nothing masks it in this branch;
+                // presumably diagonal tiles always carry the exclusion flag and
+                // take the other path - confirm.
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                }
+                // Write results
+                // Slot: atom (x + tgx) in the section belonging to tile x.
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                // Atom j (tile y) is staged in shared memory with a zeroed
+                // force accumulator; atom i (tile x) stays in registers.
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                sA[threadIdx.x].sig2    = a.x;
+                sA[threadIdx.x].eps2    = a.y;
+                apos.w                 *= cSim.epsfac;
+                // tj starts at tgx and advances through sNext, so at every step
+                // the threads of a group address GRID different staged atoms.
+                // The unsynchronised "+=" on psA[tj] below relies on that
+                // (together with lockstep execution of the group).
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    // Equal and opposite contribution for the staged atom.
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    tj              = sNext[tj]; 
+                }
+                // Write results
+                // Atom x + tgx goes to the section of tile y, and the staged
+                // atom y + tgx goes to the section of tile x (of.w stays 0).
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+        else  // bExclusion
+        {
+            // Read exclusion data
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                // excl: one mask word per thread; bit j refers to staged atom j
+                // and a cleared bit suppresses that pair (see the test below).
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];                          
+                unsigned int i          = x + tgx;
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = apos.x;
+                sA[threadIdx.x].y       = apos.y;
+                sA[threadIdx.x].z       = apos.z;
+                sA[threadIdx.x].q       = apos.w;
+                sA[threadIdx.x].sig     = a.x;
+                sA[threadIdx.x].eps     = a.y;
+                af.x                    = 0.0f;
+                af.y                    = 0.0f;
+                af.z                    = 0.0f;
+                sA[threadIdx.x].sig2    = a.x;
+                sA[threadIdx.x].eps2    = a.y;
+                apos.w                 *= cSim.epsfac;
+                // The loop reads the thread's own sig/eps back from shared
+                // memory (psA[tgx].sig2 / .eps2) instead of the registers in
+                // `a` - presumably to lower register pressure; confirm.
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = psA[tgx].sig2 + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = psA[tgx].eps2 * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    // Overwriting dEdR (rather than multiplying by a mask) also
+                    // discards the non-finite value produced when r2 == 0.
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz;
+                    excl          >>= 1;               
+                }
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF        
+                // The mask is rotated right by tgx so that bit 0 lines up with
+                // the first staged atom this thread visits (tj starts at tgx).
+                // NOTE(review): for tgx == 0 the left shift is by GRID bits; if
+                // GRID is the full width of unsigned int that shift is
+                // undefined in C++ - confirm the compiler/hardware behaviour.
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                sA[threadIdx.x].sig2    = a.x;
+                sA[threadIdx.x].eps2    = a.y;
+                apos.w                 *= cSim.epsfac;
+                // Same cyclic walk as the unmasked off-diagonal case, with the
+                // mask consumed one bit per step.
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = psA[tgx].sig2 + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = psA[tgx].eps2 * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = sNext[tj]; 
+                }
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+        // Move this thread group on to its next work unit.
+        pos -= cSim.nonbond_workBlock;     
+    }
+}
+// Kernel variant used for sm_version >= SM_12; only declared here, no
+// definition is visible in this file.
+__global__ extern void kCalculateCDLJForces_12_kernel();
+// Host entry point: picks the kernel variant from gpu->sm_version, launches it
+// with the block/thread counts stored in gpu->sim, then runs LAUNCHERROR.
+void kCalculateCDLJForces(gpuContext gpu)
+{
+    if (gpu->sm_version >= SM_12)
+    {
+        kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    }
+    else
+    {
+        kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    }
+    LAUNCHERROR("kCalculateCDLJForces");
+}
\ No newline at end of file
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+#include "gputypes.h"
+#include "cudatypes.h"
+// Unroll switches, both disabled (0). Neither macro is referenced by the code
+// in this file - presumably leftovers from tuning experiments; verify before
+// removing.
+#define UNROLLXX 0
+#define UNROLLXY 0
+// Shared-memory record for one staged atom of a tile. Field order (and hence
+// the memory layout) must stay as is: 9 consecutive floats.
+struct Atom {
+    float x, y, z;      // position, copied from pPosq[].x/.y/.z
+    float q;            // copied from pPosq[].w
+    float sig, eps;     // copied from pAttr[].x / pAttr[].y
+    float fx, fy, fz;   // force accumulated on this staged atom by the tile loop
+};
+// Shared-memory staging area; the kernel indexes it with threadIdx.x, so each
+// thread owns one Atom record.
+__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
+// Block-local copy of this block's slice of cSim.pWorkUnit, filled once at
+// kernel start.
+__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
+// Successor table filled by the kernel: sNext[i] = (i + 1) & (GRID - 1). It is
+// used to step cyclically through the staged atoms of a tile.
+__shared__ unsigned int sNext[GRID];
+// File-local constant-memory copy of gpu->sim; uploaded by
+// SetCalculateCDLJForces_12Sim() and read back by GetCalculateCDLJForces_12Sim().
+static __constant__ cudaGmxSimulation cSim;
+// Uploads the host-side simulation description (gpu->sim) into this file's
+// constant-memory copy, cSim. Any copy failure is handed to RTERROR.
+void SetCalculateCDLJForces_12Sim(gpuContext gpu)
+{
+    cudaError_t copyResult = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyResult, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+// Reads this file's constant-memory copy, cSim, back into the host-side
+// simulation description (gpu->sim). Any copy failure is handed to RTERROR.
+void GetCalculateCDLJForces_12Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    // The message names the "Get" direction so a failure here is not mistaken
+    // for a failure of the upload in SetCalculateCDLJForces_12Sim().
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+// Nonbonded pair-force kernel, "_12" variant; launched by
+// kCalculateCDLJForces_12() in this file.
+//
+// The pair work is consumed as "work units" read from cSim.pWorkUnit. Each
+// work unit names a pair of atom tiles (x, y), GRID atoms each, plus a flag
+// telling whether an exclusion mask has to be applied. A group of GRID
+// consecutive threads processes one work unit at a time: thread tgx of the
+// group owns atom x + tgx and keeps its position, sig/eps and force in
+// registers, while the partner tile's atoms are staged in shared memory (sA).
+//
+// For every pair the force scale is
+//     dEdR = (eps_i * eps_j * (12 * s^12 - 6 * s^6) + epsfac * q_i * q_j / r) / r^2
+// with s = (sig_i + sig_j) / r, i.e. a 12-6 Lennard-Jones-style term plus a
+// charge-product term.
+//
+// Results are stored (not accumulated) into cSim.pForce4a at index
+// atom + partnerTile * cSim.stride; presumably a later pass sums these
+// per-tile slots - TODO confirm.
+//
+// NOTE(review): sA is written and then read by other threads of the same
+// GRID-wide group with no barrier in between. This is only safe if those
+// threads run in lockstep (presumably GRID equals the warp size) - confirm
+// before using a toolchain/architecture without implicit warp synchrony.
+__global__ void kCalculateCDLJForces_12_kernel()
+{
+    // Read queue of work blocks once so the remainder of
+    // kernel can run asynchronously    
+    // Block b owns work units [pos, end): nbWorkUnitsPerBlock each, and the
+    // first nbWorkUnitsPerBlockRemainder blocks take one extra unit.
+    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
+    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
+    // One thread per work unit copies it to shared memory.
+    // NOTE(review): assumes end - pos fits both blockDim.x and the capacity of
+    // sWorkUnit - confirm against the host-side setup.
+    if (threadIdx.x < end - pos)
+    {
+        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
+    }
+    // Build the cyclic successor table: sNext[i] = (i + 1) mod GRID
+    // (the mask form requires GRID to be a power of two).
+    if (threadIdx.x < GRID)
+    {
+        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
+    }
+    __syncthreads();
+    // Now change pos and end to reflect work queue just read
+    // into shared memory
+    // end becomes the number of work units held by this block; each group of
+    // threads (threadIdx.x >> GRIDBITS) starts on a different unit, counting
+    // down from the last one.
+    end = end - pos; 
+    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+    while (pos >= 0)
+    {  
+        // Extract cell coordinates from appropriate work unit
+        // Layout as decoded here: bit 0 = exclusion flag, bits 2..16 = y tile
+        // index, bits 17 and up = x tile index. Shifting by GRIDBITS turns a
+        // tile index into the index of the tile's first atom.
+        unsigned int x = sWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17) << GRIDBITS;
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        float dx; 
+        float dy; 
+        float dz; 
+        float r2; 
+        float invR; 
+        float sig; 
+        float sig2; 
+        float sig6; 
+        float eps; 
+        float dEdR;  
+        // tgx: this thread's position inside its GRID-wide group;
+        // tbx: threadIdx.x of the group's first thread;
+        // psA: the group's own GRID-entry slice of sA.
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        int tj = tgx; 
+        Atom* psA = &sA[tbx];
+        if (!bExclusionFlag)
+        {
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                unsigned int i      = x + tgx;
+                apos                = cSim.pPosq[i];
+                float2 a            = cSim.pAttr[i];
+                sA[threadIdx.x].x   = apos.x;
+                sA[threadIdx.x].y   = apos.y;
+                sA[threadIdx.x].z   = apos.z;
+                sA[threadIdx.x].q   = apos.w;
+                sA[threadIdx.x].sig = a.x;
+                sA[threadIdx.x].eps = a.y;
+                af.x                = 0.0f;
+                af.y                = 0.0f;
+                af.z                = 0.0f;
+                // Fold epsfac into the local charge once, outside the loop.
+                apos.w             *= cSim.epsfac;
+                // Every thread visits all GRID atoms of the tile and keeps only
+                // its own force, so each pair is evaluated twice.
+                // NOTE(review): j == tgx is the atom paired with itself
+                // (r2 == 0, invR infinite) and nothing masks it in this branch;
+                // presumably diagonal tiles always carry the exclusion flag and
+                // take the other path - confirm.
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                }
+                // Write results
+                // Slot: atom (x + tgx) in the section belonging to tile x.
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF
+                // Atom j (tile y) is staged in shared memory with a zeroed
+                // force accumulator; atom i (tile x) stays in registers.
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                apos.w                 *= cSim.epsfac;
+                // tj starts at tgx and advances through sNext, so at every step
+                // the threads of a group address GRID different staged atoms.
+                // The unsynchronised "+=" on psA[tj] below relies on that
+                // (together with lockstep execution of the group).
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    // Equal and opposite contribution for the staged atom.
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    tj              = sNext[tj]; 
+                }
+                // Write results
+                // Atom x + tgx goes to the section of tile y, and the staged
+                // atom y + tgx goes to the section of tile x (of.w stays 0).
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+        else  // bExclusion
+        {
+            // Read exclusion data
+            if (x == y) // Handle diagonals uniquely at 50% efficiency
+            { 
+                // Read fixed atom data into registers and GRF
+                // excl: one mask word per thread; bit j refers to staged atom j
+                // and a cleared bit suppresses that pair (see the test below).
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];                          
+                unsigned int i          = x + tgx;
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = apos.x;
+                sA[threadIdx.x].y       = apos.y;
+                sA[threadIdx.x].z       = apos.z;
+                sA[threadIdx.x].q       = apos.w;
+                sA[threadIdx.x].sig     = a.x;
+                sA[threadIdx.x].eps     = a.y;
+                af.x                    = 0.0f;
+                af.y                    = 0.0f;
+                af.z                    = 0.0f;
+                apos.w                 *= cSim.epsfac;
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x; 
+                    dy              = psA[j].y - apos.y; 
+                    dz              = psA[j].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[j].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[j].q * invR; 
+                    dEdR           *= invR * invR; 
+                    // Overwriting dEdR (rather than multiplying by a mask) also
+                    // discards the non-finite value produced when r2 == 0.
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz;
+                    excl          >>= 1;               
+                }
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }         
+            else        // 100% utilization
+            {
+                // Read fixed atom data into registers and GRF        
+                // The mask is rotated right by tgx so that bit 0 lines up with
+                // the first staged atom this thread visits (tj starts at tgx).
+                // NOTE(review): for tgx == 0 the left shift is by GRID bits; if
+                // GRID is the full width of unsigned int that shift is
+                // undefined in C++ - confirm the compiler/hardware behaviour.
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
+                int j                   = y + tgx;
+                unsigned int i          = x + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                apos                    = cSim.pPosq[i];
+                float2 a                = cSim.pAttr[i];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+                sA[threadIdx.x].fx      = af.x = 0.0f;
+                sA[threadIdx.x].fy      = af.y = 0.0f;
+                sA[threadIdx.x].fz      = af.z = 0.0f;
+                apos.w                 *= cSim.epsfac;
+                // Same cyclic walk as the unmasked off-diagonal case, with the
+                // mask consumed one bit per step.
+                for (j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x; 
+                    dy              = psA[tj].y - apos.y; 
+                    dz              = psA[tj].z - apos.z; 
+                    r2              = dx * dx + dy * dy + dz * dz; 
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig; 
+                    sig2            = invR * sig; 
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2; 
+                    eps             = a.y * psA[tj].eps; 
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
+                    dEdR           += apos.w * psA[tj].q * invR; 
+                    dEdR           *= invR * invR; 
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR; 
+                    dy             *= dEdR; 
+                    dz             *= dEdR; 
+                    af.x           -= dx; 
+                    af.y           -= dy; 
+                    af.z           -= dz; 
+                    psA[tj].fx     += dx; 
+                    psA[tj].fy     += dy; 
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = sNext[tj]; 
+                }
+                // Write results
+                float4 of;
+                of.x                                = af.x;
+                of.y                                = af.y;
+                of.z                                = af.z;
+                of.w                                = 0.0f;
+                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+                of.x                                = sA[threadIdx.x].fx;
+                of.y                                = sA[threadIdx.x].fy;
+                of.z                                = sA[threadIdx.x].fz;
+                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+                cSim.pForce4a[offset]               = of;
+            }
+        }
+        // Move this thread group on to its next work unit.
+        pos -= cSim.nonbond_workBlock;     
+    }
+}
+// Host entry point for the "_12" kernel: launches it with the block and thread
+// counts stored in gpu->sim, then runs LAUNCHERROR on the launch.
+void kCalculateCDLJForces_12(gpuContext gpu)
+{
+    kCalculateCDLJForces_12_kernel<<<
+        gpu->sim.nonbond_blocks,
+        gpu->sim.nonbond_threads_per_block
+    >>>();
+    LAUNCHERROR("kCalculateCDLJForces_12");
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
--- a/platforms/cuda/src/kernels/kCalculateLocalForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateLocalForces.cu
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
--- a/platforms/cuda/src/kernels/kForces.cu
+++ b/platforms/cuda/src/kernels/kForces.cu
--- a/platforms/cuda/src/kernels/kRandom.cu
+++ b/platforms/cuda/src/kernels/kRandom.cu
--- a/platforms/cuda/src/kernels/kUpdateShakeH.cu
+++ b/platforms/cuda/src/kernels/kUpdateShakeH.cu
--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu