Update to files corrupted in r2814

9c4717f1 · Mark Friedrichs · d2365200 · 9c4717f1 · 9c4717f1 · 9c4717f1
Commit 9c4717f1 authored Jun 17, 2011 by Mark Friedrichs
4 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
-Vim: Warning: Output is not to a terminal
+#ifndef CUDATYPES_H
-[?1049h[?1h=[1;59r[?12;25h[?12l[?25h[27m[m[H[2J[?25l[59;1H"svn-commit.tmp" 15L, 601C[1;1H[33m  1 
+#define CUDATYPES_H
-  2 [m[32m--This line, and those below, will be ignored--[m
-[33m  3 
+/* -------------------------------------------------------------------------- *
-  4 [m[1m[35mM    plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp[m
+ *                                   OpenMM                                   *
-[33m  5 [m[1m[35mM    plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.cpp[m
+ * -------------------------------------------------------------------------- *
-[33m  6 [m[1m[35mM    openmmapi/include/openmm/GBVIForce.h[m
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
-[33m  7 [m[1m[35mM    openmmapi/src/GBVIForce.cpp[m
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
-[33m  8 [m[1m[35mM    olla/src/Platform.cpp[m
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
-[33m  9 [m[1m[35mM    platforms/opencl/src/OpenCLContext.h[m
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
-[33m 10 [m[1m[35mM    platforms/cuda/src/CudaKernels.cpp[m
+ *                                                                            *
-[33m 11 [m[1m[35mM    platforms/cuda/src/kernels/kCalculateGBVIBornSum.cu[m
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
-[33m 12 [m[1m[35mM    platforms/cuda/src/kernels/gputypes.h[m
+ * Authors: Scott Le Grand, Peter Eastman                                     *
-[33m 13 [m[1m[35mM    platforms/cuda/src/kernels/cudatypes.h[m
+ * Contributors:                                                              *
-[33m 14 [m[1m[35mM    platforms/cuda/src/kernels/kForces.cu[m
+ *                                                                            *
-[33m 15 [m[1m[35mM    platforms/cuda/src/kernels/gpu.cpp[m
+ * This program is free software: you can redistribute it and/or modify       *
-[1m[34m~                                                                                                                                                                                                                                         [17;1H~                                                                                                                                                                                                                                         [18;1H~                                                                                                                                                                                                                                         [19;1H~                                                                                                                                                                                                                                         [20;1H~                                                                                                                                                                                                                                         [21;1H~                                                                                                                                                                                                                                         [22;1H~                                                                                                                                                                                                                                         [23;1H~                                                                                                                                                                                                                                         [24;1H~                                                                                                                                                                                                                                         [25;1H~                                                                                                                                                                                                                                         [26;1H~                                                                                                                                                                                                                                         [27;1H~                                                                                                                                                                                                                                         [28;1H~                                                                                                                                                                                                                                         [29;1H~                                                                                                                                                                                                                                         [30;1H~                                                                                                                                                                                                                                         [31;1H~                                                                                                                                                                                                                                         [32;1H~                                                                                                                                                                                                                                         [33;1H~                                                                                                                                                                                                                                         [34;1H~                                                                                                                                                                                                                                         [35;1H~                                                                                                                                                                                                                                         [36;1H~                                                                                                                                                                                                                                         [37;1H~                                                                                                                                                                                                                                         [38;1H~                                                                                                                                                                                                                                         [39;1H~                                                                                                                                                                                                                                         [40;1H~                                                                                                                                                                                                                                         [41;1H~                                                                                                                                                                                                                                         [42;1H~                                                                                                                                                                                                                                         [43;1H~                                                                                                                                                                                                                                         [44;1H~                                                                                                                                                                                                                                         [45;1H~                                                                                                                                                                                                                                         [46;1H~                                                                                                                                                                                                                                         [47;1H~                                                                                                                                                                                                                                         [48;1H~                                                                                                                                                                                                                                         [49;1H~                                                                                                                                                                                                                                         [50;1H~                                                                                                                                                                                                                                         [51;1H~                                                                                                                                                                                                                                         [52;1H~                                                                                                                                                                                                                                         [53;1H~                                                                                                                                                                                                                                         [54;1H~                                                                                                                                                                                                                                         [55;1H~                                                                                                                                                                                                                                         [56;1H~                                                                                                                                                                                                                                         [57;1H~                                                                                                                                                                                                                                         [58;1H~                                                                                                                                                                                                                                         [m[59;217H1,0-1[9CAll[1;5H[?12l[?25h[?25l[59;1H[K[59;1H:[?12l[?25hq![?25l[59;1H[K[59;1H[?1l>[?12l[?25h[?1049l
+ * it under the terms of the GNU Lesser General Public License as published   *
-Log message unchanged or not specified
+ * by the Free Software Foundation, either version 3 of the License, or       *
-a)bort, c)ontinue, e)dit
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <stdarg.h>
+#include <limits>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cufft.h>
+#include <builtin_types.h>
+#include <vector_functions.h>
+#define RTERROR(status, s) \
+    if (status != cudaSuccess) { \
+        printf("%s %s\n", s, cudaGetErrorString(status)); \
+        exit(-1); \
+    }
+#define LAUNCHERROR(s) \
+    { \
+        cudaError_t status = cudaGetLastError(); \
+        if (status != cudaSuccess) { \
+            printf("Error: %s launching kernel %s\n", cudaGetErrorString(status), s); \
+            exit(-1); \
+        } \
+    }
+// Pure virtual class to define an interface for objects resident both on GPU and CPU
+struct SoADeviceObject {
+    virtual void Allocate() = 0;
+    virtual void Deallocate() = 0;
+    virtual void Upload() = 0;
+    virtual void Download() = 0;
+};
+template <typename T>
+struct CUDAStream : public SoADeviceObject
+{
+    unsigned int    _length;
+    unsigned int    _subStreams;
+    unsigned int    _stride;
+    T**             _pSysStream;
+    T**             _pDevStream;
+    T*              _pSysData;
+    T*              _pDevData;
+    std::string     _name;
+    CUDAStream(int length, int subStreams = 1, std::string name="");
+    CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
+    CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
+    CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
+    virtual ~CUDAStream();
+    void Allocate();
+    void Deallocate();
+    void Upload();
+    void Download();
+    void CopyFrom(const CUDAStream<T>& src);
+    void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
+    T& operator[](int index);
+};
+float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
+template <typename T>
+CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
+{
+    Allocate();   
+}
+template <typename T>
+CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
+{
+    Allocate();   
+}
+template <typename T>
+CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
+{
+    Allocate();   
+}
+template <typename T>
+CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name) : _length(length), _subStreams(subStreams), _stride((length + 0xf) & 0xfffffff0), _name(name)
+{
+    Allocate();   
+}
+template <typename T>
+CUDAStream<T>::~CUDAStream()
+{
+    Deallocate();
+}
+template <typename T>
+void CUDAStream<T>::Allocate()
+{
+    cudaError_t status;
+    _pSysStream =   new T*[_subStreams];
+    _pDevStream =   new T*[_subStreams];
+    _pSysData =     new T[_subStreams * _stride];
+    status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
+    RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());
+    for (unsigned int i = 0; i < _subStreams; i++)
+    {
+        _pSysStream[i] = _pSysData + i * _stride;
+        _pDevStream[i] = _pDevData + i * _stride;
+    }
+}
+template <typename T>
+void CUDAStream<T>::Deallocate()
+{
+    cudaError_t status;
+    delete[] _pSysStream;
+    _pSysStream = NULL;
+    delete[] _pDevStream;
+    _pDevStream = NULL;
+    delete[] _pSysData;
+    _pSysData = NULL;
+    status = cudaFree(_pDevData);
+    RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
+}
+template <typename T>
+void CUDAStream<T>::Upload()
+{
+    cudaError_t status;
+    status = cudaMemcpy(_pDevData, _pSysData, _stride * _subStreams * sizeof(T), cudaMemcpyHostToDevice);
+    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
+}
+template <typename T>
+void CUDAStream<T>::Download()
+{
+    cudaError_t status;
+    status = cudaMemcpy(_pSysData, _pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToHost);
+    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
+}
+template <typename T>
+void CUDAStream<T>::CopyFrom(const CUDAStream<T>& src)
+{
+    cudaError_t status;
+    status = cudaMemcpy(_pDevData, src._pDevData, _stride * _subStreams * sizeof(T), cudaMemcpyDeviceToDevice);
+    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Copy failed").c_str());
+}
+template <typename T>
+void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
+{
+    T* pTemp = new T[_subStreams * _stride];
+    unsigned int stream = 0;
+    unsigned int pos = 0;
+    unsigned int newstride = _stride * _subStreams / newstreams;
+    unsigned int newlength = _length * _subStreams / newstreams;
+    // Copy data into new format
+    for (unsigned int i = 0; i < _length; i++)
+    {
+        for (unsigned int j = 0; j < _subStreams; j++)
+        {
+            pTemp[stream * newstride + pos] = _pSysStream[j][i];
+            stream++;
+            if (stream == newstreams)
+            {
+                stream = 0;
+                pos++;
+            }
+        }
+    }
+    // Remap stream pointers;
+    for (unsigned int i = 0; i < newstreams; i++)
+    {
+        _pSysStream[i] = _pSysData + i * newstride;
+        _pDevStream[i] = _pDevData + i * newstride;
+    }
+    // Copy data back intro original stream
+    for (unsigned int i = 0; i < newlength; i++)
+        for (unsigned int j = 0; j < newstreams; j++)
+            _pSysStream[j][i] = pTemp[j * newstride + i];
+    _stride = newstride;
+    _length = newlength;
+    _subStreams = newstreams;
+    delete[] pTemp;
+}
+template <typename T>
+T& CUDAStream<T>::operator[](int index)
+{
+    return _pSysData[index];
+}
+static const unsigned int GRID = 32;
+static const unsigned int GRIDBITS = 5;
+static const int G8X_BLOCKS_PER_SM                      = 1;
+static const int GT2XX_BLOCKS_PER_SM                    = 1;
+static const int GF1XX_BLOCKS_PER_SM                    = 1;
+static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
+static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
+static const int GF1XX_NONBOND_THREADS_PER_BLOCK        = 768;
+//static const int GF1XX_NONBOND_THREADS_PER_BLOCK        = 768;
+static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
+static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
+static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK     = 768;
+//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK     = 768;
+static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
+static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
+static const int GF1XX_SHAKE_THREADS_PER_BLOCK          = 512;
+static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
+static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
+static const int GF1XX_UPDATE_THREADS_PER_BLOCK         = 768;
+static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
+static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
+static const int GF1XX_LOCALFORCES_THREADS_PER_BLOCK    = 768;
+static const int G8X_THREADS_PER_BLOCK                  = 256;
+static const int GT2XX_THREADS_PER_BLOCK                = 256;
+static const int GF1XX_THREADS_PER_BLOCK                = 512;
+static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
+static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
+static const int GF1XX_RANDOM_THREADS_PER_BLOCK         = 768;
+static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
+static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
+static const int GF1XX_NONBOND_WORKUNITS_PER_SM         = 768;
+static const unsigned int MAX_STACK_SIZE = 8;
+static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
+static const float PI = 3.14159265358979323846f;
+static const int PME_ORDER = 5;
+enum CudaNonbondedMethod
+{
+    NO_CUTOFF,
+    CUTOFF,
+    PERIODIC,
+    EWALD,
+    PARTICLE_MESH_EWALD
+};
+enum ExpressionOp {
+    VARIABLE0 = 0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, MULTIPLY, DIVIDE, ADD, SUBTRACT, POWER, MULTIPLY_CONSTANT, POWER_CONSTANT, ADD_CONSTANT,
+        GLOBAL, CONSTANT, CUSTOM, CUSTOM_DERIV, NEGATE, RECIPROCAL, SQRT, EXP, LOG, SQUARE, CUBE, STEP, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SINH, COSH, TANH, ERF, ERFC,
+        MIN, MAX, ABS
+};
+template<int SIZE>
+struct Expression {
+    int op[SIZE];
+    float arg[SIZE];
+    int length, stackSize;
+};
+struct cudaGmxSimulation {
+    // Constants
+    unsigned int    atoms;                          // Number of atoms
+    unsigned int    paddedNumberOfAtoms;            // Padded number of atoms
+    unsigned int    blocks;                         // Number of blocks to launch across linear kernels
+    unsigned int    blocksPerSM;                    // Number of blocks per share memory
+    unsigned int    nonbond_blocks;                 // Number of blocks to launch across CDLJ and Born Force Part1
+    unsigned int    bornForce2_blocks;              // Number of blocks to launch across Born Force 2
+    unsigned int    interaction_blocks;             // Number of blocks to launch when identifying interacting tiles
+    unsigned int    threads_per_block;              // Threads per block to launch
+    unsigned int    nonbond_threads_per_block;      // Threads per block in nonbond kernel calls
+    unsigned int    bornForce2_threads_per_block;   // Threads per block in nonbond kernel calls
+    unsigned int    max_update_threads_per_block;   // Maximum threads per block in update kernel calls
+    unsigned int    update_threads_per_block;       // Threads per block in update kernel calls
+    unsigned int    bf_reduce_threads_per_block;    // Threads per block in Born Force reduction calls
+    unsigned int    bsf_reduce_threads_per_block;   // Threads per block in Born Sum And Forces reduction calls
+    unsigned int    max_shake_threads_per_block;    // Maximum threads per block in shake kernel calls
+    unsigned int    shake_threads_per_block;        // Threads per block in shake kernel calls
+    unsigned int    settle_threads_per_block;       // Threads per block in SETTLE kernel calls
+    unsigned int    ccma_threads_per_block;         // Threads per block in CCMA kernel calls
+    unsigned int    max_localForces_threads_per_block;  // Threads per block in local forces kernel calls
+    unsigned int    localForces_threads_per_block;  // Threads per block in local forces kernel calls
+    unsigned int    random_threads_per_block;       // Threads per block in RNG kernel calls
+    unsigned int    interaction_threads_per_block;  // Threads per block when identifying interacting tiles
+    unsigned int    custom_exception_threads_per_block; // Threads per block in custom nonbonded exception kernel calls
+    unsigned int    customExpressionStackSize;      // Stack size for evaluating custom nonbonded forces
+    unsigned int    workUnits;                      // Number of work units
+    unsigned int*   pWorkUnit;                      // Pointer to work units
+    unsigned int*   pInteractingWorkUnit;           // Pointer to work units that have interactions
+    unsigned int*   pInteractionFlag;               // Flags for which work units have interactions
+    float2*         pStepSize;                      // The size of the previous and current time steps
+    float*          pLangevinParameters;            // Parameters used for Langevin integration
+    float           errorTol;                       // Error tolerance for selecting the step size
+    size_t*         pInteractionCount;              // A count of the number of work units which have interactions
+    unsigned int    nonbond_workBlock;              // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
+    unsigned int    bornForce2_workBlock;           // Number of work units running second half of Born Forces calculation
+    unsigned int    workUnitsPerSM;                 // Number of workblocks per SM
+    unsigned int    nbWorkUnitsPerBlock;            // Number of work units assigned to each nonbond block
+    unsigned int    nbWorkUnitsPerBlockRemainder;   // Remainder of work units to assign across lower numbered nonbond blocks
+    unsigned int    bf2WorkUnitsPerBlock;           // Number of work units assigned to each bornForce2 block
+    unsigned int    bf2WorkUnitsPerBlockRemainder;  // Remainder of work units to assign across lower numbered bornForce2 blocks
+    unsigned int    stride;                         // Atomic attributes stride
+    unsigned int    stride2;                        // Atomic attributes stride x 2
+    unsigned int    stride3;                        // Atomic attributes stride x 3
+    unsigned int    stride4;                        // Atomic attributes stride x 4
+    unsigned int    nonbondOutputBuffers;           // Nonbond output buffers per nonbond call
+    unsigned int    outputBuffers;                  // Number of output buffers
+    unsigned int    energyOutputBuffers;            // Number of energy output buffers
+    float           bigFloat;                       // Floating point value used as a flag for Shaken atoms 
+    float           epsfac;                         // Epsilon factor for CDLJ calculations
+    CudaNonbondedMethod nonbondedMethod;            // How to handle nonbonded interactions
+    CudaNonbondedMethod customNonbondedMethod;      // How to handle custom nonbonded interactions
+    float           nonbondedCutoff;                // Cutoff distance for nonbonded interactions
+    float           nonbondedCutoffSqr;             // Square of the cutoff distance for nonbonded interactions
+    float           periodicBoxSizeX;               // The X dimension of the periodic box
+    float           periodicBoxSizeY;               // The Y dimension of the periodic box
+    float           periodicBoxSizeZ;               // The Z dimension of the periodic box
+    float           invPeriodicBoxSizeX;            // The 1 over the X dimension of the periodic box
+    float           invPeriodicBoxSizeY;            // The 1 over the Y dimension of the periodic box
+    float           invPeriodicBoxSizeZ;            // The 1 over the Z dimension of the periodic box
+    float           recipBoxSizeX;                  // The X dimension of the reciprocal box for Ewald summation
+    float           recipBoxSizeY;                  // The Y dimension of the reciprocal box for Ewald summation
+    float           recipBoxSizeZ;                  // The Z dimension of the reciprocal box for Ewald summation
+    float           cellVolume;                     // Ewald parameter alpha (a.k.a. kappa)
+    float           alphaEwald;                     // Ewald parameter alpha (a.k.a. kappa)
+    float           factorEwald;                    // - 1 ( 4 * alphaEwald * alphaEwald)
+    int             kmaxX;                          // Maximum number of reciprocal vectors in the X direction
+    int             kmaxY;                          // Maximum number of reciprocal vectors in the Y direction
+    int             kmaxZ;                          // Maximum number of reciprocal vectors in the Z direction
+    float           reactionFieldK;                 // Constant for reaction field correction
+    float           reactionFieldC;                 // Constant for reaction field correction
+    float           probeRadius;                    // SASA probe radius
+    float           surfaceAreaFactor;              // ACE approximation surface area factor
+    float           electricConstant;               // ACE approximation electric constant
+    float           forceConversionFactor;          // kJ to kcal force conversion factor
+    float           preFactor;                      // Born electrostatic pre-factor
+    float           dielectricOffset;               // Born dielectric offset
+    float           alphaOBC;                       // OBC alpha factor
+    float           betaOBC;                        // OBC beta factor
+    float           gammaOBC;                       // OBC gamma factor
+    float           deltaT;                         // Molecular dynamics deltaT constant
+    float           oneOverDeltaT;                  // 1/deltaT
+    float           T;                              // Temperature
+    float           kT;                             // Boltzmann's constant times T
+    float           noiseAmplitude;                 // The magnitude of the noise for Brownian dynamics
+    float           tau;                            // Inverse friction for Langevin or Brownian dynamics
+    float           tauDeltaT;                      // tau*deltaT
+    float           collisionFrequency;             // Collision frequency for Andersen thermostat
+    float2*         pObcData;                       // Pointer to fixed Born data
+    int             gbviBornRadiusScalingMethod;    // scaling method for GB/VI Born radii
+    float           gbviQuinticLowerLimitFactor;    // Lower limit factor for scaing of GB/VI Born radii using quintic spline
+    float           gbviQuinticUpperBornRadiusLimit;// Upper limit for GB/VI Born radii 
+    float4*         pGBVIData;                      // Pointer to fixed Born data for GB/VI algorithm
+    float*          pGBVISwitchDerivative;          // Pointer to GB/VI Born switch derivatives
+    float2*         pAttr;                          // Pointer to additional atom attributes (sig, eps)
+    float4*         pCustomParams;                  // Pointer to atom parameters for custom nonbonded force
+    unsigned int    customExceptions;               // Number of custom nonbonded exceptions
+    unsigned int    customParameters;               // Number of parameters for custom nonbonded interactions
+    int4*           pCustomBondID;                  // Atom indices for custom bonds
+    float4*         pCustomBondParams;              // Parameters for custom bonds
+    unsigned int    customBonds;                    // Number of custom bonds
+    unsigned int    customBondParameters;           // Number of parameters for custom bonds
+    int4*           pCustomAngleID1;                // Atom indices for custom angles
+    int2*           pCustomAngleID2;                // Atom indices for custom angles
+    float4*         pCustomAngleParams;             // Parameters for custom angles
+    unsigned int    customAngles;                   // Number of custom angles
+    unsigned int    customAngleParameters;          // Number of parameters for custom angles
+    int4*           pCustomTorsionID1;              // Atom indices for custom torsions
+    int4*           pCustomTorsionID2;              // Atom indices for custom torsions
+    float4*         pCustomTorsionParams;           // Parameters for custom torsions
+    unsigned int    customTorsions;                 // Number of custom torsions
+    unsigned int    customTorsionParameters;        // Number of parameters for custom torsions
+    int*            pCustomExternalID;              // Atom indices for custom external force
+    float4*         pCustomExternalParams;          // Parameters for custom external force
+    unsigned int    customExternals;                // Number of particles for custom external force
+    unsigned int    customExternalParameters;       // Number of parameters for custom external force
+    float4*         pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
+    float4*         pTabulatedFunctionParams;       // The min, max, and spacing for each tabulated function
+    float2*         pEwaldCosSinSum;                // Pointer to the cos/sin sums (ewald)
+    float*          pTabulatedErfc;                 // Tabulated values for erfc()
+    int             tabulatedErfcSize;              // The number of tabulated values for erfc()
+    float           tabulatedErfcScale;             // Scale factor for the argument to erfc()
+    int3            pmeGridSize;                    // The dimensions of the grid for particle mesh Ewald
+    int3            pmeGroupSize;                   // The dimensions of the groups used in charge spreading for PME
+    cufftComplex*   pPmeGrid;                       // Grid points for particle mesh Ewald
+    float*          pPmeBsplineModuli[3];
+    float4*         pPmeBsplineTheta;
+    float4*         pPmeBsplineDtheta;
+    int*            pPmeAtomRange;                  // The range of sorted atoms at each grid point
+    int2*           pPmeAtomGridIndex;              // The grid point each atom is at
+    unsigned int    bonds;                          // Number of bonds
+    int4*           pBondID;                        // Bond atom and output buffer IDs
+    float2*         pBondParameter;                 // Bond parameters
+    unsigned int    bond_angles;                    // Number of bond angles
+    int4*           pBondAngleID1;                  // Bond angle atom and first output buffer IDs
+    int2*           pBondAngleID2;                  // Bond angle output buffer IDs
+    float2*         pBondAngleParameter;            // Bond angle parameters
+    unsigned int    dihedrals;                      // Number of dihedrals
+    int4*           pDihedralID1;                   // Dihedral IDs
+    int4*           pDihedralID2;                   // Dihedral output buffer IDs
+    float4*         pDihedralParameter;             // Dihedral parameters
+    unsigned int    rb_dihedrals;                   // Number of Ryckaert Bellemans dihedrals
+    int4*           pRbDihedralID1;                 // Ryckaert Bellemans Dihedral IDs
+    int4*           pRbDihedralID2;                 // Ryckaert Bellemans Dihedral output buffer IDs
+    float4*         pRbDihedralParameter1;          // Ryckaert Bellemans Dihedral parameters
+    float2*         pRbDihedralParameter2;          // Ryckaert Bellemans Dihedral parameters
+    unsigned int    LJ14s;                          // Number of Lennard Jones 1-4 interactions
+    int4*           pLJ14ID;                        // Lennard Jones 1-4 atom and output buffer IDs
+    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
+    float           inverseTotalMass;               // Used in linear momentum removal
+    unsigned int    ShakeConstraints;               // Total number of Shake constraints
+    unsigned int    settleConstraints;              // Total number of Settle constraints
+    unsigned int    ccmaConstraints;                // Total number of CCMA constraints.
+    unsigned int    rigidClusters;                  // Total number of rigid clusters
+    unsigned int    maxRigidClusterSize;            // The size of the largest rigid cluster
+    unsigned int    clusterShakeBlockSize;          // The number of threads to process each rigid cluster
+    unsigned int    maxShakeIterations;             // Maximum shake iterations
+    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
+    float           shakeTolerance;                 // Shake tolerance
+    float           InvMassJ;                       // Shake inverse mass for hydrogens
+    int*            pNonShakeID;                    // Not Shaking atoms
+    int4*           pShakeID;                       // Shake atoms and phase
+    float4*         pShakeParameter;                // Shake parameters
+    int4*           pSettleID;                      // Settle atoms
+    float2*         pSettleParameter;               // Settle parameters
+    unsigned int*   pExclusion;                     // Nonbond exclusion data
+    unsigned int*   pExclusionIndex;                // Index of exclusion data for each work unit
+    unsigned int    bond_offset;                    // Offset to end of bonds
+    unsigned int    bond_angle_offset;              // Offset to end of bond angles
+    unsigned int    dihedral_offset;                // Offset to end of dihedrals
+    unsigned int    rb_dihedral_offset;             // Offset to end of Ryckaert Bellemans dihedrals
+    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
+    int*            pAtomIndex;                     // The original index of each atom
+    float4*         pGridBoundingBox;               // The size of each grid cell
+    float4*         pGridCenter;                    // The center of each grid cell
+    int2*           pCcmaAtoms;                     // The atoms connected by each CCMA constraint
+    float4*         pCcmaDistance;                  // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
+    float*          pCcmaDelta1;                    // Workspace for CCMA
+    float*          pCcmaDelta2;                    // Workspace for CCMA
+    int*            pCcmaAtomConstraints;           // The indices of constraints involving each atom
+    int*            pCcmaNumAtomConstraints;        // The number of constraints involving each atom
+    int*            ccmaConvergedDeviceMarker;      // Device memory used to communicate that CCMA has converged
+    float*          pCcmaReducedMass;               // The reduced mass for each CCMA constraint
+    unsigned int*   pConstraintMatrixColumn;        // The column of each element in the constraint matrix.
+    float*          pConstraintMatrixValue;         // The value of each element in the constraint matrix.
+    // Mutable stuff
+    float4*         pPosq;                          // Pointer to atom positions and charges
+    float4*         pPosqP;                         // Pointer to mid-integration atom positions
+    float4*         pOldPosq;                       // Pointer to old atom positions
+    float4*         pVelm4;                         // Pointer to atom velocity and inverse mass
+    float4*         pForce4;                        // Pointer to force data
+    float*          pEnergy;                        // Pointer to energy output buffer
+    float*          pBornForce;                     // Pointer to Born force data
+    float*	    pBornSum;                       // Pointer to Born Radii calculation output buffers
+    float*	    pBornRadii;                     // Pointer to Born Radii
+    float*          pObcChain;                      // Pointer to OBC chain data
+    float4*         pLinearMomentum;                // Pointer to linear momentum
+    // Random numbers
+    float4*         pRandom4;                       // Pointer to 4 random numbers
+    float2*         pRandom2;                       // Pointer to 2 random numbers
+    uint4*          pRandomSeed;                    // Pointer to random seeds
+    int*            pRandomPosition;                // Pointer to random number positions
+    unsigned int    randoms;                        // Number of randoms
+    unsigned int    totalRandoms;                   // Number of randoms plus overflow.
+    unsigned int    randomIterations;               // Number of iterations before regenerating randoms
+    unsigned int    randomFrames;                   // Number of frames of random numbers
+};
+struct Vectors {
+    float3 v0;
+    float3 v1;
+    float3 v2;
+};
+#endif
--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
-Vim: Warning: Output is not to a terminal
+/* -------------------------------------------------------------------------- *
-[?1049h[?1h=[1;59r[?12;25h[?12l[?25h[27m[m[H[2J[?25l[59;1H"svn-commit.tmp" 15L, 601C[1;1H[33m  1 
+ *                                   OpenMM                                   *
-  2 [m[32m--This line, and those below, will be ignored--[m
+ * -------------------------------------------------------------------------- *
-[33m  3 
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
-  4 [m[1m[35mM    plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp[m
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
-[33m  5 [m[1m[35mM    plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.cpp[m
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
-[33m  6 [m[1m[35mM    openmmapi/include/openmm/GBVIForce.h[m
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
-[33m  7 [m[1m[35mM    openmmapi/src/GBVIForce.cpp[m
+ *                                                                            *
-[33m  8 [m[1m[35mM    olla/src/Platform.cpp[m
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
-[33m  9 [m[1m[35mM    platforms/opencl/src/OpenCLContext.h[m
+ * Authors: Scott Le Grand, Peter Eastman                                     *
-[33m 10 [m[1m[35mM    platforms/cuda/src/CudaKernels.cpp[m
+ * Contributors:                                                              *
-[33m 11 [m[1m[35mM    platforms/cuda/src/kernels/kCalculateGBVIBornSum.cu[m
+ *                                                                            *
-[33m 12 [m[1m[35mM    platforms/cuda/src/kernels/gputypes.h[m
+ * This program is free software: you can redistribute it and/or modify       *
-[33m 13 [m[1m[35mM    platforms/cuda/src/kernels/cudatypes.h[m
+ * it under the terms of the GNU Lesser General Public License as published   *
-[33m 14 [m[1m[35mM    platforms/cuda/src/kernels/kForces.cu[m
+ * by the Free Software Foundation, either version 3 of the License, or       *
-[33m 15 [m[1m[35mM    platforms/cuda/src/kernels/gpu.cpp[m
+ * (at your option) any later version.                                        *
-[1m[34m~                                                                                                                                                                                                                                         [17;1H~                                                                                                                                                                                                                                         [18;1H~                                                                                                                                                                                                                                         [19;1H~                                                                                                                                                                                                                                         [20;1H~                                                                                                                                                                                                                                         [21;1H~                                                                                                                                                                                                                                         [22;1H~                                                                                                                                                                                                                                         [23;1H~                                                                                                                                                                                                                                         [24;1H~                                                                                                                                                                                                                                         [25;1H~                                                                                                                                                                                                                                         [26;1H~                                                                                                                                                                                                                                         [27;1H~                                                                                                                                                                                                                                         [28;1H~                                                                                                                                                                                                                                         [29;1H~                                                                                                                                                                                                                                         [30;1H~                                                                                                                                                                                                                                         [31;1H~                                                                                                                                                                                                                                         [32;1H~                                                                                                                                                                                                                                         [33;1H~                                                                                                                                                                                                                                         [34;1H~                                                                                                                                                                                                                                         [35;1H~                                                                                                                                                                                                                                         [36;1H~                                                                                                                                                                                                                                         [37;1H~                                                                                                                                                                                                                                         [38;1H~                                                                                                                                                                                                                                         [39;1H~                                                                                                                                                                                                                                         [40;1H~                                                                                                                                                                                                                                         [41;1H~                                                                                                                                                                                                                                         [42;1H~                                                                                                                                                                                                                                         [43;1H~                                                                                                                                                                                                                                         [44;1H~                                                                                                                                                                                                                                         [45;1H~                                                                                                                                                                                                                                         [46;1H~                                                                                                                                                                                                                                         [47;1H~                                                                                                                                                                                                                                         [48;1H~                                                                                                                                                                                                                                         [49;1H~                                                                                                                                                                                                                                         [50;1H~                                                                                                                                                                                                                                         [51;1H~                                                                                                                                                                                                                                         [52;1H~                                                                                                                                                                                                                                         [53;1H~                                                                                                                                                                                                                                         [54;1H~                                                                                                                                                                                                                                         [55;1H~                                                                                                                                                                                                                                         [56;1H~                                                                                                                                                                                                                                         [57;1H~                                                                                                                                                                                                                                         [58;1H~                                                                                                                                                                                                                                         [m[59;217H1,0-1[9CAll[1;5H[?12l[?25h[?25l[59;1H[K[59;1H:[?12l[?25hq![?25l[59;1H[K[59;1H[?1l>[?12l[?25h[?1049l
+ *                                                                            *
-Log message unchanged or not specified
+ * This program is distributed in the hope that it will be useful,            *
-a)bort, c)ontinue, e)dit
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <string.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <cmath>
+#include <map>
+#include <set>
+#include <algorithm>
+#ifdef WIN32
+  #define _USE_MATH_DEFINES /* M_PI */
+  #include <math.h>
+  #include <windows.h>
+#else
+  #include <stdint.h>
+#endif
+using namespace std;
+#include "gputypes.h"
+#include "cudaKernels.h"
+#include "hilbert.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/internal/SplineFitter.h"
+#include "quern.h"
+#include "Lepton.h"
+#include "rng.h"
+#include "../CudaForceInfo.h"
+// In case we're using some primitive version of Visual Studio this will
+// make sure that erf() and erfc() are defined.
+#include "openmm/internal/MSVC_erfc.h"
+#include "openmm/internal/windowsExport.h"
+using OpenMM::OpenMMException;
+using Lepton::Operation;
+struct ShakeCluster {
+    int centralID;
+    int peripheralID[3];
+    int size;
+    bool valid;
+    float distance;
+    float centralInvMass, peripheralInvMass;
+    ShakeCluster() : valid(true) {
+    }
+    ShakeCluster(int centralID, float invMass) : centralID(centralID), centralInvMass(invMass), size(0), valid(true) {
+    }
+    void addAtom(int id, float dist, float invMass) {
+        if (size == 3 || (size > 0 && dist != distance) || (size > 0 && invMass != peripheralInvMass))
+            valid = false;
+        else {
+            peripheralID[size++] = id;
+            distance = dist;
+            peripheralInvMass = invMass;
+        }
+    }
+};
+struct Constraint
+{
+    Constraint(int atom1, int atom2, float distance2) : atom1(atom1), atom2(atom2), distance2(distance2) {
+    }
+    int atom1, atom2;
+    float distance2;
+};
+struct ConstraintOrderer : public binary_function<int, int, bool> {
+    const vector<int>& atom1;
+    const vector<int>& atom2;
+    ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2) : atom1(atom1), atom2(atom2) {
+    }
+    bool operator()(int x, int y) {
+        if (atom1[x] != atom1[y])
+            return atom1[x] < atom1[y];
+        return atom2[x] < atom2[y];
+    }
+};
+struct Molecule {
+    vector<int> atoms;
+    vector<int> constraints;
+    vector<vector<int> > groups;
+};
+static const float dielectricOffset         =    0.009f;
+static const float probeRadius              =    0.14f;
+static const float forceConversionFactor    =    0.4184f;
+//static const float surfaceAreaFactor        =   -6.0f * 0.06786f * forceConversionFactor * 1000.0f;  // PI * 4.0f * 0.0049f * 1000.0f;
+//static const float surfaceAreaFactor        =   -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
+static const float surfaceAreaFactor        = -6.0f*PI*0.0216f*1000.0f*0.4184f;
+//static const float surfaceAreaFactor        = -1.7035573959e+001;
+//static const float surfaceAreaFactor        = -166.03185f;
+//static const float surfaceAreaFactor        = 1.0f;
+static const float alphaOBC                 =    1.0f;
+static const float betaOBC                  =    0.8f;
+static const float gammaOBC                 =    4.85f;
+static const float kcalMolTokJNM            =   -0.4184f;
+static const float electricConstant         = -166.03185f;
+static const float defaultInnerDielectric   =    1.0f;
+static const float defaultSolventDielectric =   78.3f;
+static const float KILO                     =    1e3;                      // Thousand
+static const float BOLTZMANN                =    1.380658e-23f;            // (J/K)    
+static const float AVOGADRO                 =    6.0221367e23f;            // ()        
+static const float RGAS                     =    BOLTZMANN * AVOGADRO;     // (J/(mol K))
+static const float BOLTZ                    =    (RGAS / KILO);            // (kJ/(mol K)) 
+#define DUMP_PARAMETERS 0
+template <int SIZE>
+static Expression<SIZE> createExpression(gpuContext gpu, const string& expression, const Lepton::ExpressionProgram& program, const vector<string>& variables,
+        const vector<string>& globalParamNames, unsigned int& maxStackSize) {
+    Expression<SIZE> exp;
+    if (program.getNumOperations() > SIZE)
+        throw OpenMMException("Expression contains too many operations: "+expression);
+    exp.length = program.getNumOperations();
+    exp.stackSize = program.getStackSize();
+    if (exp.stackSize > (int) maxStackSize)
+        maxStackSize = exp.stackSize;
+    for (int i = 0; i < program.getNumOperations(); i++) {
+        const Operation& op = program.getOperation(i);
+        switch (op.getId()) {
+            case Operation::CONSTANT:
+                exp.op[i] = CONSTANT;
+                exp.arg[i] = (float) dynamic_cast<const Operation::Constant*>(&op)->getValue();
+                break;
+            case Operation::VARIABLE:
+                if (variables.size() > 0 && op.getName() == variables[0])
+                    exp.op[i] = VARIABLE0;
+                else if (variables.size() > 1 && op.getName() == variables[1])
+                    exp.op[i] = VARIABLE1;
+                else if (variables.size() > 2 && op.getName() == variables[2])
+                    exp.op[i] = VARIABLE2;
+                else if (variables.size() > 3 && op.getName() == variables[3])
+                    exp.op[i] = VARIABLE3;
+                else if (variables.size() > 4 && op.getName() == variables[4])
+                    exp.op[i] = VARIABLE4;
+                else if (variables.size() > 5 && op.getName() == variables[5])
+                    exp.op[i] = VARIABLE5;
+                else if (variables.size() > 6 && op.getName() == variables[6])
+                    exp.op[i] = VARIABLE6;
+                else if (variables.size() > 7 && op.getName() == variables[7])
+                    exp.op[i] = VARIABLE7;
+                else if (variables.size() > 8 && op.getName() == variables[8])
+                    exp.op[i] = VARIABLE8;
+                else {
+                    int j;
+                    for (j = 0; j < (int) globalParamNames.size() && op.getName() != globalParamNames[j]; j++);
+                    if (j == globalParamNames.size())
+                        throw OpenMMException("Unknown variable '"+op.getName()+"' in expression: "+expression);
+                    exp.op[i] = GLOBAL;
+                    exp.arg[i] = (float) j;
+                }
+                break;
+            case Operation::CUSTOM:
+                exp.op[i] = dynamic_cast<const Operation::Custom*>(&op)->getDerivOrder()[0] == 0 ? CUSTOM : CUSTOM_DERIV;
+                for (int j = 0; j < MAX_TABULATED_FUNCTIONS; j++)
+                    if (op.getName() == gpu->tabulatedFunctions[j].name) {
+                        exp.arg[i] = (float) j;
+                        break;
+                    }
+                break;
+            case Operation::ADD:
+                exp.op[i] = ADD;
+                break;
+            case Operation::SUBTRACT:
+                exp.op[i] = SUBTRACT;
+                break;
+            case Operation::MULTIPLY:
+                exp.op[i] = MULTIPLY;
+                break;
+            case Operation::DIVIDE:
+                exp.op[i] = DIVIDE;
+                break;
+            case Operation::POWER:
+                exp.op[i] = POWER;
+                break;
+            case Operation::NEGATE:
+                exp.op[i] = NEGATE;
+                break;
+            case Operation::SQRT:
+                exp.op[i] = SQRT;
+                break;
+            case Operation::EXP:
+                exp.op[i] = EXP;
+                break;
+            case Operation::LOG:
+                exp.op[i] = LOG;
+                break;
+            case Operation::SIN:
+                exp.op[i] = SIN;
+                break;
+            case Operation::COS:
+                exp.op[i] = COS;
+                break;
+            case Operation::SEC:
+                exp.op[i] = SEC;
+                break;
+            case Operation::CSC:
+                exp.op[i] = CSC;
+                break;
+            case Operation::TAN:
+                exp.op[i] = TAN;
+                break;
+            case Operation::COT:
+                exp.op[i] = COT;
+                break;
+            case Operation::ASIN:
+                exp.op[i] = ASIN;
+                break;
+            case Operation::ACOS:
+                exp.op[i] = ACOS;
+                break;
+            case Operation::ATAN:
+                exp.op[i] = ATAN;
+                break;
+            case Operation::SINH:
+                exp.op[i] = SINH;
+                break;
+            case Operation::COSH:
+                exp.op[i] = COSH;
+                break;
+            case Operation::TANH:
+                exp.op[i] = TANH;
+                break;
+            case Operation::ERF:
+                exp.op[i] = ERF;
+                break;
+            case Operation::ERFC:
+                exp.op[i] = ERFC;
+                break;
+            case Operation::STEP:
+                exp.op[i] = STEP;
+                break;
+            case Operation::SQUARE:
+                exp.op[i] = SQUARE;
+                break;
+            case Operation::CUBE:
+                exp.op[i] = CUBE;
+                break;
+            case Operation::RECIPROCAL:
+                exp.op[i] = RECIPROCAL;
+                break;
+            case Operation::ADD_CONSTANT:
+                exp.op[i] = ADD_CONSTANT;
+                exp.arg[i] = (float) dynamic_cast<const Operation::AddConstant*>(&op)->getValue();
+                break;
+            case Operation::MULTIPLY_CONSTANT:
+                exp.op[i] = MULTIPLY_CONSTANT;
+                exp.arg[i] = (float) dynamic_cast<const Operation::MultiplyConstant*>(&op)->getValue();
+                break;
+            case Operation::POWER_CONSTANT:
+                exp.op[i] = POWER_CONSTANT;
+                exp.arg[i] = (float) dynamic_cast<const Operation::PowerConstant*>(&op)->getValue();
+                break;
+            case Operation::MIN:
+                exp.op[i] = MIN;
+                break;
+            case Operation::MAX:
+                exp.op[i] = MAX;
+                break;
+            case Operation::ABS:
+                exp.op[i] = ABS;
+                break;
+        }
+    }
+    return exp;
+}
+extern "C"
+void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
+{
+    int bonds = atom1.size();
+    gpu->sim.bonds                              = bonds;
+    CUDAStream<int4>* psBondID                  = new CUDAStream<int4>(bonds, 1, "BondID");
+    gpu->psBondID                               = psBondID;
+    gpu->sim.pBondID                            = psBondID->_pDevStream[0];
+    CUDAStream<float2>* psBondParameter         = new CUDAStream<float2>(bonds, 1, "BondParameter");
+    gpu->psBondParameter                        = psBondParameter;
+    gpu->sim.pBondParameter                     = psBondParameter->_pDevStream[0];
+    for (int i = 0; i < bonds; i++)
+    {
+        (*psBondID)[i].x = atom1[i];
+        (*psBondID)[i].y = atom2[i];
+        (*psBondParameter)[i].x = length[i];
+        (*psBondParameter)[i].y = k[i];
+        psBondID->_pSysData[i].z = gpu->pOutputBufferCounter[psBondID->_pSysData[i].x]++;
+        psBondID->_pSysData[i].w = gpu->pOutputBufferCounter[psBondID->_pSysData[i].y]++;
+#if (DUMP_PARAMETERS == 1)                
+        cout << 
+            i << " " << 
+            (*psBondID)[i].x << " " <<
+            (*psBondID)[i].y << " " <<
+            (*psBondID)[i].z << " " <<
+            (*psBondID)[i].w << " " <<
+            (*psBondParameter)[i].x << " " <<
+            (*psBondParameter)[i].y <<
+            endl;
+#endif
+    }
+    psBondID->Upload();
+    psBondParameter->Upload();
+}
+extern "C"
+void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
+        const vector<float>& angle, const vector<float>& k)
+{
+    int bond_angles = atom1.size();
+    gpu->sim.bond_angles                        = bond_angles;
+    CUDAStream<int4>* psBondAngleID1            = new CUDAStream<int4>(bond_angles, 1, "BondAngleID1");
+    gpu->psBondAngleID1                         = psBondAngleID1;
+    gpu->sim.pBondAngleID1                      = psBondAngleID1->_pDevStream[0];
+    CUDAStream<int2>* psBondAngleID2            = new CUDAStream<int2>(bond_angles, 1, "BondAngleID2");
+    gpu->psBondAngleID2                         = psBondAngleID2;
+    gpu->sim.pBondAngleID2                      = psBondAngleID2->_pDevStream[0];
+    CUDAStream<float2>* psBondAngleParameter    = new CUDAStream<float2>(bond_angles, 1, "BondAngleParameter");
+    gpu->psBondAngleParameter                   = psBondAngleParameter;
+    gpu->sim.pBondAngleParameter                = psBondAngleParameter->_pDevStream[0];        
+    for (int i = 0; i < bond_angles; i++)
+    {
+        (*psBondAngleID1)[i].x = atom1[i];
+        (*psBondAngleID1)[i].y = atom2[i];
+        (*psBondAngleID1)[i].z = atom3[i];
+        (*psBondAngleParameter)[i].x = angle[i];
+        (*psBondAngleParameter)[i].y = k[i];
+        psBondAngleID1->_pSysData[i].w = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].x]++;
+        psBondAngleID2->_pSysData[i].x = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].y]++;
+        psBondAngleID2->_pSysData[i].y = gpu->pOutputBufferCounter[psBondAngleID1->_pSysData[i].z]++;
+#if (DUMP_PARAMETERS == 1)
+         cout << 
+            i << " " << 
+            (*psBondAngleID1)[i].x << " " <<
+            (*psBondAngleID1)[i].y << " " <<
+            (*psBondAngleID1)[i].z << " " <<
+            (*psBondAngleID1)[i].w << " " <<
+            (*psBondAngleID2)[i].x << " " <<
+            (*psBondAngleID2)[i].y << " " <<
+            (*psBondAngleParameter)[i].x << " " <<
+            (*psBondAngleParameter)[i].y <<
+            endl;
+#endif
+    }
+    psBondAngleID1->Upload();
+    psBondAngleID2->Upload();
+    psBondAngleParameter->Upload();
+}
+extern "C"
+void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
+        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
+{
+        int dihedrals = atom1.size();
+        gpu->sim.dihedrals = dihedrals;
+        CUDAStream<int4>* psDihedralID1             = new CUDAStream<int4>(dihedrals, 1, "DihedralID1");
+        gpu->psDihedralID1                          = psDihedralID1;
+        gpu->sim.pDihedralID1                       = psDihedralID1->_pDevStream[0];
+        CUDAStream<int4>* psDihedralID2             = new CUDAStream<int4>(dihedrals, 1, "DihedralID2");
+        gpu->psDihedralID2                          = psDihedralID2;
+        gpu->sim.pDihedralID2                       = psDihedralID2->_pDevStream[0];
+        CUDAStream<float4>* psDihedralParameter     = new CUDAStream<float4>(dihedrals, 1, "DihedralParameter");
+        gpu->psDihedralParameter                    = psDihedralParameter;
+        gpu->sim.pDihedralParameter                 = psDihedralParameter->_pDevStream[0];
+        for (int i = 0; i < dihedrals; i++)
+        {
+            (*psDihedralID1)[i].x = atom1[i];
+            (*psDihedralID1)[i].y = atom2[i];
+            (*psDihedralID1)[i].z = atom3[i];
+            (*psDihedralID1)[i].w = atom4[i];
+            (*psDihedralParameter)[i].x = k[i];
+            (*psDihedralParameter)[i].y = phase[i];
+            (*psDihedralParameter)[i].z = (float) periodicity[i];
+            psDihedralID2->_pSysData[i].x = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].x]++;
+            psDihedralID2->_pSysData[i].y = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].y]++;
+            psDihedralID2->_pSysData[i].z = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].z]++;
+            psDihedralID2->_pSysData[i].w = gpu->pOutputBufferCounter[psDihedralID1->_pSysData[i].w]++;
+#if (DUMP_PARAMETERS == 1)
+            cout << 
+                i << " " << 
+                (*psDihedralID1)[i].x << " " <<
+                (*psDihedralID1)[i].y << " " <<
+                (*psDihedralID1)[i].z << " " <<
+                (*psDihedralID1)[i].w << " " <<
+                (*psDihedralID2)[i].x << " " <<
+                (*psDihedralID2)[i].y << " " <<
+                (*psDihedralID2)[i].z << " " <<
+                (*psDihedralID2)[i].w << " " <<
+                (*psDihedralParameter)[i].x << " " <<
+                (*psDihedralParameter)[i].y << " " <<
+                (*psDihedralParameter)[i].z << endl;
+#endif
+        }
+        psDihedralID1->Upload();
+        psDihedralID2->Upload();
+        psDihedralParameter->Upload();
+}
+extern "C"
+void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
+        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
+{
+    int rb_dihedrals = atom1.size();
+    gpu->sim.rb_dihedrals = rb_dihedrals;
+    CUDAStream<int4>* psRbDihedralID1           = new CUDAStream<int4>(rb_dihedrals, 1, "RbDihedralID1");
+    gpu->psRbDihedralID1                        = psRbDihedralID1;
+    gpu->sim.pRbDihedralID1                     = psRbDihedralID1->_pDevStream[0];
+    CUDAStream<int4>* psRbDihedralID2           = new CUDAStream<int4>(rb_dihedrals, 1, "RbDihedralID2");
+    gpu->psRbDihedralID2                        = psRbDihedralID2;
+    gpu->sim.pRbDihedralID2                     = psRbDihedralID2->_pDevStream[0];
+    CUDAStream<float4>* psRbDihedralParameter1  = new CUDAStream<float4>(rb_dihedrals, 1, "RbDihedralParameter1");
+    gpu->psRbDihedralParameter1                 = psRbDihedralParameter1;
+    gpu->sim.pRbDihedralParameter1              = psRbDihedralParameter1->_pDevStream[0];
+    CUDAStream<float2>* psRbDihedralParameter2  = new CUDAStream<float2>(rb_dihedrals, 1, "RbDihedralParameter2");
+    gpu->psRbDihedralParameter2                 = psRbDihedralParameter2;
+    gpu->sim.pRbDihedralParameter2              = psRbDihedralParameter2->_pDevStream[0];
+    for (int i = 0; i < rb_dihedrals; i++)
+    {
+        (*psRbDihedralID1)[i].x = atom1[i];
+        (*psRbDihedralID1)[i].y = atom2[i];
+        (*psRbDihedralID1)[i].z = atom3[i];
+        (*psRbDihedralID1)[i].w = atom4[i];
+        (*psRbDihedralParameter1)[i].x = c0[i];
+        (*psRbDihedralParameter1)[i].y = c1[i];
+        (*psRbDihedralParameter1)[i].z = c2[i];
+        (*psRbDihedralParameter1)[i].w = c3[i];
+        (*psRbDihedralParameter2)[i].x = c4[i];
+        (*psRbDihedralParameter2)[i].y = c5[i];
+        psRbDihedralID2->_pSysData[i].x = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].x]++;
+        psRbDihedralID2->_pSysData[i].y = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].y]++;
+        psRbDihedralID2->_pSysData[i].z = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].z]++;
+        psRbDihedralID2->_pSysData[i].w = gpu->pOutputBufferCounter[psRbDihedralID1->_pSysData[i].w]++;
+#if (DUMP_PARAMETERS == 1)
+        cout << 
+            i << " " << 
+            (*psRbDihedralID1)[i].x << " " <<
+            (*psRbDihedralID1)[i].y << " " <<
+            (*psRbDihedralID1)[i].z << " " <<
+            (*psRbDihedralID1)[i].w <<" " <<
+            (*psRbDihedralID2)[i].x << " " <<
+            (*psRbDihedralID2)[i].y << " " <<
+            (*psRbDihedralID2)[i].z << " " <<
+            (*psRbDihedralID2)[i].w <<" " <<
+            (*psRbDihedralParameter1)[i].x << " " <<
+            (*psRbDihedralParameter1)[i].y << " " <<
+            (*psRbDihedralParameter1)[i].z << " " <<
+            (*psRbDihedralParameter1)[i].w << " " <<
+            (*psRbDihedralParameter2)[i].x << " " <<
+            (*psRbDihedralParameter2)[i].y <<
+            endl;
+#endif
+    }
+    psRbDihedralID1->Upload();
+    psRbDihedralID2->Upload();
+    psRbDihedralParameter1->Upload();
+    psRbDihedralParameter2->Upload();
+}
+extern "C"
+void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
+        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
+{
+    int LJ14s = atom1.size();
+    float scale = epsfac * fudge;
+    gpu->sim.LJ14s                              = LJ14s;
+    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1, "LJ14ID");
+    gpu->psLJ14ID                               = psLJ14ID;
+    gpu->sim.pLJ14ID                            = psLJ14ID->_pDevStream[0];
+    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1, "LJ14Parameter");
+    gpu->psLJ14Parameter                        = psLJ14Parameter;
+    gpu->sim.pLJ14Parameter                     = psLJ14Parameter->_pDevStream[0];
+    for (int i = 0; i < LJ14s; i++)
+    {
+        (*psLJ14ID)[i].x = atom1[i];
+        (*psLJ14ID)[i].y = atom2[i];
+        psLJ14ID->_pSysData[i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].x]++;
+        psLJ14ID->_pSysData[i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].y]++;
+        float p0, p1, p2;
+        if (c12[i] == 0.0f)
+        {
+            p0 = 0.0f;
+            p1 = 1.0f;
+        }
+        else
+        {
+            p0 = c6[i] * c6[i] / c12[i];
+            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
+        }
+        p2 = scale * q1[i] * q2[i];
+        (*psLJ14Parameter)[i].x = p0;
+        (*psLJ14Parameter)[i].y = p1;
+        (*psLJ14Parameter)[i].z = p2;
+    }
+#if (DUMP_PARAMETERS == 1)
+        cout << 
+            i << " " <<
+            (*psLJ14ID)[i].x << " " <<
+            (*psLJ14ID)[i].y << " " <<
+            (*psLJ14ID)[i].z << " " <<
+            (*psLJ14ID)[i].w << " " <<
+            (*psLJ14Parameter)[i].x << " " <<
+            (*psLJ14Parameter)[i].y << " " <<
+            (*psLJ14Parameter)[i].z << " " <<
+            p0 << " " << 
+            p1 << " " << 
+            p2 << " " << 
+            endl;
+#endif
+    psLJ14ID->Upload();
+    psLJ14Parameter->Upload();
+}
+extern "C" void setExclusions(gpuContext gpu, const vector<vector<int> >& exclusions) {
+    if (gpu->exclusions.size() > 0) {
+        bool ok = (exclusions.size() == gpu->exclusions.size());
+        for (int i = 0; i < (int) exclusions.size() && ok; i++) {
+            if (exclusions[i].size() != gpu->exclusions[i].size())
+                ok = false;
+            else {
+                for (int j = 0; j < (int) exclusions[i].size(); j++)
+                    if (find(gpu->exclusions[i].begin(), gpu->exclusions[i].end(), exclusions[i][j]) == gpu->exclusions[i].end())
+                        ok = false;
+            }
+        }
+        if (!ok)
+            throw OpenMMException("All nonbonded forces must have identical sets of exceptions");
+    }
+    gpu->exclusions = exclusions;
+}
+extern "C"
+void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
+        const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
+{
+    unsigned int coulombs = c6.size();
+    gpu->sim.epsfac = epsfac;
+    gpu->sim.nonbondedMethod = method;
+    if (coulombs > 0)
+        setExclusions(gpu, exclusions);
+    for (unsigned int i = 0; i < coulombs; i++)
+    {
+            float p0 = q[i];
+            float p1 = 0.5f, p2 = 0.0f;               
+            if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
+            {
+                p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
+                p2 = c6[i] * sqrt(1.0f / c12[i]);
+            }
+            if (symbol.size() > 0)
+                gpu->pAtomSymbol[i] = symbol[i];
+            (*gpu->psPosq4)[i].w = p0;
+            (*gpu->psSigEps2)[i].x = p1;
+            (*gpu->psSigEps2)[i].y = p2;
+    }
+    // Dummy out extra atom data
+    for (unsigned int i = gpu->natoms; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psPosq4)[i].x       = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].y       = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].z       = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].w       = 0.0f;
+        (*gpu->psSigEps2)[i].x     = 0.0f;
+        (*gpu->psSigEps2)[i].y     = 0.0f;
+    }
+    gpu->psPosq4->Upload();
+    gpu->psSigEps2->Upload();
+}
+extern "C"
+void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
+{
+    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
+        throw OpenMMException("All nonbonded forces must use the same cutoff");
+    gpu->sim.nonbondedCutoff = cutoffDistance;
+    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
+    gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
+    gpu->sim.reactionFieldC = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f);
+}
+extern "C"
+void gpuSetTabulatedFunction(gpuContext gpu, int index, const string& name, const vector<double>& values, double min, double max)
+{
+    if (index < 0 || index >= MAX_TABULATED_FUNCTIONS) {
+        stringstream str;
+        str << "Only " << MAX_TABULATED_FUNCTIONS << " tabulated functions are supported";
+        throw OpenMMException(str.str());
+    }
+    if (gpu->tabulatedFunctions[index].coefficients != NULL)
+        delete gpu->tabulatedFunctions[index].coefficients;
+    CUDAStream<float4>* coeff = new CUDAStream<float4>((int) values.size()-1, 1, "TabulatedFunction");
+    gpu->tabulatedFunctions[index].coefficients = coeff;
+    gpu->sim.pTabulatedFunctionCoefficients[index] = coeff->_pDevData;
+    gpu->tabulatedFunctions[index].name = name;
+    gpu->tabulatedFunctions[index].min = min;
+    gpu->tabulatedFunctions[index].max = max;
+    gpu->tabulatedFunctionsChanged = true;
+    // Compute the spline coefficients.
+    int numValues = values.size();
+    vector<double> x(numValues), derivs;
+    for (int i = 0; i < numValues; i++)
+        x[i] = min+i*(max-min)/(numValues-1);
+    OpenMM::SplineFitter::createNaturalSpline(x, values, derivs);
+    for (int i = 0; i < (int) values.size()-1; i++)
+        (*coeff)[i] = make_float4((float) values[i], (float) values[i+1], (float) (derivs[i]/6.0), (float) (derivs[i+1]/6.0));
+    coeff->Upload();
+}
+extern "C"
+void gpuSetCustomBondParameters(gpuContext gpu, const vector<int>& bondAtom1, const vector<int>& bondAtom2, const vector<vector<double> >& bondParams,
+            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
+{
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-bond parameters for custom bond forces");
+    if (globalParamNames.size() > 8)
+        throw OpenMMException("CudaPlatform only supports eight global parameters for custom bond forces");
+    if (gpu->psCustomBondID != NULL)
+        throw OpenMMException("CudaPlatform only supports a single CustomBondForce per System");
+    gpu->sim.customBonds = bondAtom1.size();
+    gpu->sim.customBondParameters = paramNames.size();
+    gpu->psCustomBondID = new CUDAStream<int4>(gpu->sim.customBonds, 1, "CustomBondId");
+    gpu->sim.pCustomBondID = gpu->psCustomBondID->_pDevData;
+    gpu->psCustomBondParams = new CUDAStream<float4>(gpu->sim.customBonds, 1, "CustomBondParams");
+    gpu->sim.pCustomBondParams = gpu->psCustomBondParams->_pDevData;
+    vector<int> forceBufferCounter(gpu->natoms, 0);
+    for (int i = 0; i < (int) bondAtom1.size(); i++) {
+        (*gpu->psCustomBondID)[i].x = bondAtom1[i];
+        (*gpu->psCustomBondID)[i].y = bondAtom2[i];
+        (*gpu->psCustomBondID)[i].z = forceBufferCounter[bondAtom1[i]]++;
+        (*gpu->psCustomBondID)[i].w = forceBufferCounter[bondAtom2[i]]++;
+        if (bondParams[i].size() > 0)
+            (*gpu->psCustomBondParams)[i].x = (float) bondParams[i][0];
+        if (bondParams[i].size() > 1)
+            (*gpu->psCustomBondParams)[i].y = (float) bondParams[i][1];
+        if (bondParams[i].size() > 2)
+            (*gpu->psCustomBondParams)[i].z = (float) bondParams[i][2];
+        if (bondParams[i].size() > 3)
+            (*gpu->psCustomBondParams)[i].w = (float) bondParams[i][3];
+    }
+    gpu->psCustomBondID->Upload();
+    gpu->psCustomBondParams->Upload();
+    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
+        if (forceBufferCounter[i] > (int) gpu->pOutputBufferCounter[i])
+            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
+    // Create the Expressions.
+    vector<string> variables;
+    variables.push_back("r");
+    for (int i = 0; i < (int) paramNames.size(); i++)
+        variables.push_back(paramNames[i]);
+    SetCustomBondEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    SetCustomBondForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+}
+extern "C"
+void gpuSetCustomAngleParameters(gpuContext gpu, const vector<int>& angleAtom1, const vector<int>& angleAtom2, const vector<int>& angleAtom3, const vector<vector<double> >& angleParams,
+            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
+{
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-angle parameters for custom angle forces");
+    if (globalParamNames.size() > 8)
+        throw OpenMMException("CudaPlatform only supports eight global parameters for custom angle forces");
+    if (gpu->psCustomAngleID1 != NULL)
+        throw OpenMMException("CudaPlatform only supports a single CustomAngleForce per System");
+    gpu->sim.customAngles = angleAtom1.size();
+    gpu->sim.customAngleParameters = paramNames.size();
+    gpu->psCustomAngleID1 = new CUDAStream<int4>(gpu->sim.customAngles, 1, "CustomAngleId1");
+    gpu->sim.pCustomAngleID1 = gpu->psCustomAngleID1->_pDevData;
+    gpu->psCustomAngleID2 = new CUDAStream<int2>(gpu->sim.customAngles, 1, "CustomAngleId2");
+    gpu->sim.pCustomAngleID2 = gpu->psCustomAngleID2->_pDevData;
+    gpu->psCustomAngleParams = new CUDAStream<float4>(gpu->sim.customAngles, 1, "CustomAngleParams");
+    gpu->sim.pCustomAngleParams = gpu->psCustomAngleParams->_pDevData;
+    vector<int> forceBufferCounter(gpu->natoms, 0);
+    for (int i = 0; i < (int) angleAtom1.size(); i++) {
+        (*gpu->psCustomAngleID1)[i].x = angleAtom1[i];
+        (*gpu->psCustomAngleID1)[i].y = angleAtom2[i];
+        (*gpu->psCustomAngleID1)[i].z = angleAtom3[i];
+        (*gpu->psCustomAngleID1)[i].w = forceBufferCounter[angleAtom1[i]]++;
+        (*gpu->psCustomAngleID2)[i].x = forceBufferCounter[angleAtom2[i]]++;
+        (*gpu->psCustomAngleID2)[i].y = forceBufferCounter[angleAtom3[i]]++;
+        if (angleParams[i].size() > 0)
+            (*gpu->psCustomAngleParams)[i].x = (float) angleParams[i][0];
+        if (angleParams[i].size() > 1)
+            (*gpu->psCustomAngleParams)[i].y = (float) angleParams[i][1];
+        if (angleParams[i].size() > 2)
+            (*gpu->psCustomAngleParams)[i].z = (float) angleParams[i][2];
+        if (angleParams[i].size() > 3)
+            (*gpu->psCustomAngleParams)[i].w = (float) angleParams[i][3];
+    }
+    gpu->psCustomAngleID1->Upload();
+    gpu->psCustomAngleID2->Upload();
+    gpu->psCustomAngleParams->Upload();
+    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
+        if (forceBufferCounter[i] > (int) gpu->pOutputBufferCounter[i])
+            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
+    // Create the Expressions.
+    vector<string> variables;
+    variables.push_back("theta");
+    for (int i = 0; i < (int) paramNames.size(); i++)
+        variables.push_back(paramNames[i]);
+    SetCustomAngleEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    SetCustomAngleForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("theta").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+}
+extern "C"
+void gpuSetCustomTorsionParameters(gpuContext gpu, const vector<int>& torsionAtom1, const vector<int>& torsionAtom2, const vector<int>& torsionAtom3, const vector<int>& torsionAtom4, const vector<vector<double> >& torsionParams,
+            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
+{
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-torsion parameters for custom torsion forces");
+    if (globalParamNames.size() > 8)
+        throw OpenMMException("CudaPlatform only supports eight global parameters for custom torsion forces");
+    if (gpu->psCustomTorsionID1 != NULL)
+        throw OpenMMException("CudaPlatform only supports a single CustomTorsionForce per System");
+    gpu->sim.customTorsions = torsionAtom1.size();
+    gpu->sim.customTorsionParameters = paramNames.size();
+    gpu->psCustomTorsionID1 = new CUDAStream<int4>(gpu->sim.customTorsions, 1, "CustomTorsionId1");
+    gpu->sim.pCustomTorsionID1 = gpu->psCustomTorsionID1->_pDevData;
+    gpu->psCustomTorsionID2 = new CUDAStream<int4>(gpu->sim.customTorsions, 1, "CustomTorsionId2");
+    gpu->sim.pCustomTorsionID2 = gpu->psCustomTorsionID2->_pDevData;
+    gpu->psCustomTorsionParams = new CUDAStream<float4>(gpu->sim.customTorsions, 1, "CustomTorsionParams");
+    gpu->sim.pCustomTorsionParams = gpu->psCustomTorsionParams->_pDevData;
+    vector<int> forceBufferCounter(gpu->natoms, 0);
+    for (int i = 0; i < (int) torsionAtom1.size(); i++) {
+        (*gpu->psCustomTorsionID1)[i].x = torsionAtom1[i];
+        (*gpu->psCustomTorsionID1)[i].y = torsionAtom2[i];
+        (*gpu->psCustomTorsionID1)[i].z = torsionAtom3[i];
+        (*gpu->psCustomTorsionID1)[i].w = torsionAtom4[i];
+        (*gpu->psCustomTorsionID2)[i].x = forceBufferCounter[torsionAtom1[i]]++;
+        (*gpu->psCustomTorsionID2)[i].y = forceBufferCounter[torsionAtom2[i]]++;
+        (*gpu->psCustomTorsionID2)[i].z = forceBufferCounter[torsionAtom3[i]]++;
+        (*gpu->psCustomTorsionID2)[i].w = forceBufferCounter[torsionAtom4[i]]++;
+        if (torsionParams[i].size() > 0)
+            (*gpu->psCustomTorsionParams)[i].x = (float) torsionParams[i][0];
+        if (torsionParams[i].size() > 1)
+            (*gpu->psCustomTorsionParams)[i].y = (float) torsionParams[i][1];
+        if (torsionParams[i].size() > 2)
+            (*gpu->psCustomTorsionParams)[i].z = (float) torsionParams[i][2];
+        if (torsionParams[i].size() > 3)
+            (*gpu->psCustomTorsionParams)[i].w = (float) torsionParams[i][3];
+    }
+    gpu->psCustomTorsionID1->Upload();
+    gpu->psCustomTorsionID2->Upload();
+    gpu->psCustomTorsionParams->Upload();
+    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
+        if (forceBufferCounter[i] > (int) gpu->pOutputBufferCounter[i])
+            gpu->pOutputBufferCounter[i] = forceBufferCounter[i];
+    // Create the Expressions.
+    vector<string> variables;
+    variables.push_back("theta");
+    for (int i = 0; i < (int) paramNames.size(); i++)
+        variables.push_back(paramNames[i]);
+    SetCustomTorsionEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    SetCustomTorsionForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("theta").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+}
+extern "C"
+void gpuSetCustomExternalParameters(gpuContext gpu, const vector<int>& atomIndex, const vector<vector<double> >& atomParams,
+            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
+{
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-particle parameters for custom external forces");
+    if (globalParamNames.size() > 8)
+        throw OpenMMException("CudaPlatform only supports eight global parameters for custom external forces");
+    if (gpu->psCustomExternalID != NULL)
+        throw OpenMMException("CudaPlatform only supports a single CustomExternalForce per System");
+    gpu->sim.customExternals = atomIndex.size();
+    gpu->sim.customExternalParameters = paramNames.size();
+    gpu->psCustomExternalID = new CUDAStream<int>(gpu->sim.customExternals, 1, "CustomExternalId");
+    gpu->sim.pCustomExternalID = gpu->psCustomExternalID->_pDevData;
+    gpu->psCustomExternalParams = new CUDAStream<float4>(gpu->sim.customExternals, 1, "CustomExternalParams");
+    gpu->sim.pCustomExternalParams = gpu->psCustomExternalParams->_pDevData;
+    for (int i = 0; i < (int) atomIndex.size(); i++) {
+        (*gpu->psCustomExternalID)[i] = atomIndex[i];
+        if (atomParams[i].size() > 0)
+            (*gpu->psCustomExternalParams)[i].x = (float) atomParams[i][0];
+        if (atomParams[i].size() > 1)
+            (*gpu->psCustomExternalParams)[i].y = (float) atomParams[i][1];
+        if (atomParams[i].size() > 2)
+            (*gpu->psCustomExternalParams)[i].z = (float) atomParams[i][2];
+        if (atomParams[i].size() > 3)
+            (*gpu->psCustomExternalParams)[i].w = (float) atomParams[i][3];
+    }
+    gpu->psCustomExternalID->Upload();
+    gpu->psCustomExternalParams->Upload();
+    // Create the Expressions.
+    vector<string> variables;
+    variables.push_back("x");
+    variables.push_back("y");
+    variables.push_back("z");
+    for (int i = 0; i < (int) paramNames.size(); i++)
+        variables.push_back(paramNames[i]);
+    SetCustomExternalEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    SetCustomExternalForceExpressions(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("x").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize),
+                                  createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("y").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize),
+                                  createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("z").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+}
+extern "C"
+void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double> >& parameters, const vector<vector<int> >& exclusions,
+            CudaNonbondedMethod method, float cutoffDistance, const string& energyExp,
+            const vector<string>& paramNames, const vector<string>& globalParamNames)
+{
+    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
+        throw OpenMMException("All nonbonded forces must use the same cutoff");
+    if (paramNames.size() > 4)
+        throw OpenMMException("CudaPlatform only supports four per-atom parameters for custom nonbonded forces");
+    if (globalParamNames.size() > 8)
+        throw OpenMMException("CudaPlatform only supports eight global parameters for custom nonbonded forces");
+    gpu->sim.nonbondedCutoff = cutoffDistance;
+    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
+    gpu->sim.customNonbondedMethod = method;
+    gpu->sim.customParameters = paramNames.size();
+    setExclusions(gpu, exclusions);
+    gpu->psCustomParams = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "CustomParams");
+    gpu->sim.pCustomParams = gpu->psCustomParams->_pDevData;
+    for (int i = 0; i < (int) parameters.size(); i++) {
+        if (parameters[i].size() > 0)
+            (*gpu->psCustomParams)[i].x = (float) parameters[i][0];
+        if (parameters[i].size() > 1)
+            (*gpu->psCustomParams)[i].y = (float) parameters[i][1];
+        if (parameters[i].size() > 2)
+            (*gpu->psCustomParams)[i].z = (float) parameters[i][2];
+        if (parameters[i].size() > 3)
+            (*gpu->psCustomParams)[i].w = (float) parameters[i][3];
+    }
+    gpu->psCustomParams->Upload();
+    // This class serves as a placeholder for custom functions in expressions.
+    class FunctionPlaceholder : public Lepton::CustomFunction {
+    public:
+        int getNumArguments() const {
+            return 1;
+        }
+        double evaluate(const double* arguments) const {
+            return 0.0;
+        }
+        double evaluateDerivative(const double* arguments, const int* derivOrder) const {
+            return 0.0;
+        }
+        CustomFunction* clone() const {
+            return new FunctionPlaceholder();
+        }
+    };
+    // Record the tabulated functions, which were previously set with calls to gpuSetTabulatedFunction().
+    FunctionPlaceholder* fp = new FunctionPlaceholder();
+    map<string, Lepton::CustomFunction*> functions;
+    gpu->psTabulatedFunctionParams = new CUDAStream<float4>(MAX_TABULATED_FUNCTIONS, 1, "TabulatedFunctionRange");
+    gpu->sim.pTabulatedFunctionParams = gpu->psTabulatedFunctionParams->_pDevData;
+    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++) {
+        gpuTabulatedFunction& func = gpu->tabulatedFunctions[i];
+        if (func.coefficients != NULL) {
+            (*gpu->psTabulatedFunctionParams)[i] = make_float4((float) func.min, (float) func.max, (float) (func.coefficients->_length/(func.max-func.min)), (float) (func.coefficients->_length-1));
+            functions[func.name] = fp;
+        }
+    }
+    gpu->psTabulatedFunctionParams->Upload();
+    // Create the Expressions.
+    vector<string> variables;
+    for (int j = 1; j < 3; j++) {
+        for (int i = 0; i < (int) paramNames.size(); i++) {
+            stringstream name;
+            name << paramNames[i] << j;
+            variables.push_back(name.str());
+        }
+        for (int i = paramNames.size(); i < 4; i++)
+            variables.push_back("");
+    }
+    variables.push_back("r");
+    SetCustomNonbondedEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    SetCustomNonbondedForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
+    delete fp;
+}
+static void tabulateErfc(gpuContext gpu)
+{
+    int tableSize = 2048;
+    gpu->sim.tabulatedErfcSize = tableSize;
+    gpu->sim.tabulatedErfcScale = tableSize/(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff);
+    gpu->psTabulatedErfc = new CUDAStream<float>(tableSize, 1, "TabulatedErfc");
+    gpu->sim.pTabulatedErfc = gpu->psTabulatedErfc->_pDevData;
+    for (int i = 0; i < tableSize; ++i)
+        (*gpu->psTabulatedErfc)[i] = (float) erfc(i*(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff)/tableSize);
+    gpu->psTabulatedErfc->Upload();
+}
+extern "C"
+void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz)
+{
+    gpu->sim.alphaEwald         = alpha;
+    gpu->sim.factorEwald        = -1 / (4*alpha*alpha);
+    gpu->sim.kmaxX              = kmaxx;
+    gpu->sim.kmaxY              = kmaxy;
+    gpu->sim.kmaxZ              = kmaxz;
+    gpu->psEwaldCosSinSum       = new CUDAStream<float2>((gpu->sim.kmaxX*2-1) * (gpu->sim.kmaxY*2-1) * (gpu->sim.kmaxZ*2-1), 1, "EwaldCosSinSum");
+    gpu->sim.pEwaldCosSinSum    = gpu->psEwaldCosSinSum->_pDevStream[0];
+    tabulateErfc(gpu);
+}
+extern "C"
+void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
+{
+    gpu->sim.alphaEwald         = alpha;
+    int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
+    gpu->sim.pmeGridSize = gridSize;
+    int3 groupSize = make_int3(2, 4, 4);
+    gpu->sim.pmeGroupSize = groupSize;
+    const int3 numGroups = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
+    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
+    cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
+    gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
+    gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
+    gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
+    gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
+    gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
+    gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
+    gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
+    gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
+    gpu->psPmeBsplineTheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineTheta");
+    gpu->sim.pPmeBsplineTheta = gpu->psPmeBsplineTheta->_pDevData;
+    gpu->psPmeBsplineDtheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineDtheta");
+    gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
+    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
+    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
+    gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
+    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
+    tabulateErfc(gpu);
+    // Initialize the b-spline moduli.
+    int maxSize = max(max(gridSize.x, gridSize.y), gridSize.z);
+    vector<double> data(PME_ORDER);
+    vector<double> ddata(PME_ORDER);
+    vector<double> bsplines_data(maxSize);
+    data[PME_ORDER-1] = 0.0;
+    data[1] = 0.0;
+    data[0] = 1.0;
+    for (int i = 3; i < PME_ORDER; i++)
+    {
+        double div = 1.0/(i-1.0);
+        data[i-1] = 0.0;
+        for (int j = 1; j < (i-1); j++)
+            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
+        data[0] = div*data[0];
+    }
+    // Differentiate.
+    ddata[0] = -data[0];
+    for (int i = 1; i < PME_ORDER; i++)
+        ddata[i] = data[i-1]-data[i];
+    double div = 1.0/(PME_ORDER-1);
+    data[PME_ORDER-1] = 0.0;
+    for (int i = 1; i < (PME_ORDER-1); i++)
+        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
+    data[0] = div*data[0];
+    for (int i = 0; i < maxSize; i++)
+        bsplines_data[i] = 0.0;
+    for (int i = 1; i <= PME_ORDER; i++)
+        bsplines_data[i] = data[i-1];
+    // Evaluate the actual bspline moduli for X/Y/Z.
+    for(int dim = 0; dim < 3; dim++)
+    {
+        int ndata = (dim == 0 ? gridSize.x : dim == 1 ? gridSize.y : gridSize.z);
+        for (int i = 0; i < ndata; i++)
+        {
+            double sc = 0.0;
+            double ss = 0.0;
+            for (int j = 0; j < ndata; j++)
+            {
+                double arg = (2.0*M_PI*i*j)/ndata;
+                sc += bsplines_data[j]*cos(arg);
+                ss += bsplines_data[j]*sin(arg);
+            }
+            (*gpu->psPmeBsplineModuli[dim])[i] = (float) (sc*sc+ss*ss);
+        }
+        for (int i = 0; i < ndata; i++)
+        {
+            if ((*gpu->psPmeBsplineModuli[dim])[i] < 1.0e-7)
+                (*gpu->psPmeBsplineModuli[dim])[i] = ((*gpu->psPmeBsplineModuli[dim])[i-1]+(*gpu->psPmeBsplineModuli[dim])[i+1])*0.5f;
+        }
+        gpu->psPmeBsplineModuli[dim]->Upload();
+    }
+}
+extern "C"
+void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
+{
+    gpu->sim.periodicBoxSizeX = xsize;
+    gpu->sim.periodicBoxSizeY = ysize;
+    gpu->sim.periodicBoxSizeZ = zsize;
+    gpu->sim.invPeriodicBoxSizeX = 1.0f/xsize;
+    gpu->sim.invPeriodicBoxSizeY = 1.0f/ysize;
+    gpu->sim.invPeriodicBoxSizeZ = 1.0f/zsize;
+    gpu->sim.recipBoxSizeX = 2.0f*PI/gpu->sim.periodicBoxSizeX;
+    gpu->sim.recipBoxSizeY = 2.0f*PI/gpu->sim.periodicBoxSizeY;
+    gpu->sim.recipBoxSizeZ = 2.0f*PI/gpu->sim.periodicBoxSizeZ;
+    gpu->sim.cellVolume = gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ;
+}
+extern "C"
+void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<float>& radius, const vector<float>& scale, const vector<float>& charge)
+{
+    unsigned int atoms = radius.size();
+    gpu->bIncludeGBSA = true;
+    for (unsigned int i = 0; i < atoms; i++)
+    {
+            (*gpu->psObcData)[i].x = radius[i] - dielectricOffset;
+            (*gpu->psObcData)[i].y = scale[i] * (*gpu->psObcData)[i].x;
+            (*gpu->psPosq4)[i].w = charge[i];
+#if (DUMP_PARAMETERS == 1)
+        cout << 
+            i << " " << 
+            (*gpu->psObcData)[i].x << " " <<
+            (*gpu->psObcData)[i].y;
+#endif
+    }
+    // Dummy out extra atom data
+    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psBornRadii)[i]     = 0.2f;
+        (*gpu->psObcData)[i].x     = 0.01f;
+        (*gpu->psObcData)[i].y     = 0.01f;
+    }
+    gpu->psBornRadii->Upload();
+    gpu->psObcData->Upload();
+    gpu->psPosq4->Upload();
+    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+}
+extern "C"
+void gpuSetGBVIParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, 
+                          const vector<float>& gamma, const vector<float>& scaledRadii, int bornRadiusScalingMethod, float quinticLowerLimitFactor,
+                          float quinticUpperBornRadiusLimit )
+{
+    unsigned int atoms = atom.size();
+    gpu->bIncludeGBVI  = true;
+    double tau         = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); 
+    gpu->sim.gbviQuinticLowerLimitFactor     = quinticLowerLimitFactor;
+    gpu->sim.gbviQuinticUpperBornRadiusLimit = quinticUpperBornRadiusLimit;
+    gpu->sim.gbviBornRadiusScalingMethod     = bornRadiusScalingMethod; 
+    for (unsigned int i = 0; i < atoms; i++)
+    {
+            (*gpu->psGBVIData)[i].x            = radius[i];
+            (*gpu->psGBVIData)[i].y            = scaledRadii[i];
+            (*gpu->psGBVIData)[i].z            = (float) (tau*gamma[i]);
+            (*gpu->psGBVIData)[i].w            = 1.0f;
+            (*gpu->psGBVISwitchDerivative)[i]  = 1.0f;
+(*gpu->psObcData)[i].x  = radius[i];
+(*gpu->psObcData)[i].y  = 0.9f*radius[i];
+#undef DUMP_PARAMETERS
+#define DUMP_PARAMETERS 0
+#if (DUMP_PARAMETERS == 1)
+        (void) fprintf( stderr,"GBVI param: %5u R=%14.7e scaledR=%14.7e gamma*tau=%14.7e bornRadiusScaleFactor=%14.7e\n",
+                        i, (*gpu->psGBVIData)[i].x, (*gpu->psGBVIData)[i].y,
+                        (*gpu->psGBVIData)[i].z, (*gpu->psGBVIData)[i].w ); 
+#endif
+    }
+//(void) fprintf( stderr, "gpuSetGBVIParameters: setting Obc parameters!!!! should be removed.\n" );
+    // Dummy out extra atom data
+    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psBornRadii)[i]              = 0.2f;
+        (*gpu->psGBVIData)[i].x             = 0.01f;
+        (*gpu->psGBVIData)[i].y             = 0.01f;
+        (*gpu->psGBVIData)[i].z             = 0.01f;
+        (*gpu->psGBVIData)[i].w             = 1.00f;
+        (*gpu->psGBVISwitchDerivative)[i]   = 1.0f;
+    }
+    gpu->psBornRadii->Upload();
+    gpu->psGBVIData->Upload();
+    gpu->psGBVISwitchDerivative->Upload();
+gpu->psObcData->Upload();
+    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+#if (DUMP_PARAMETERS == 1)
+(void) fprintf( stderr, "gpuSetGBVIParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
+                gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
+#endif
+}
+static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
+{
+    cluster.valid = false;
+    invalidForShake[cluster.centralID] = true;
+    for (int i = 0; i < cluster.size; i++) {
+        invalidForShake[cluster.peripheralID[i]] = true;
+        map<int, ShakeCluster>::iterator otherCluster = allClusters.find(cluster.peripheralID[i]);
+        if (otherCluster != allClusters.end() && otherCluster->second.valid)
+            markShakeClusterInvalid(otherCluster->second, allClusters, invalidForShake);
+    }
+}
+extern "C"
+void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
+        const vector<float>& invMass1, const vector<float>& invMass2, float constraintTolerance)
+{
+    // Create a vector for recording which atoms are handled by SHAKE (or SETTLE).
+    vector<bool> isShakeAtom(gpu->natoms, false);
+    // Find how many constraints each atom is involved in.
+    vector<int> constraintCount(gpu->natoms, 0);
+    for (int i = 0; i < (int)atom1.size(); i++) {
+        constraintCount[atom1[i]]++;
+        constraintCount[atom2[i]]++;
+    }
+    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
+    // atom that might be part of such a cluster, make a list of the two other atoms it is
+    // connected to.
+    vector<map<int, float> > settleConstraints(gpu->natoms);
+    for (int i = 0; i < (int)atom1.size(); i++) {
+        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
+            settleConstraints[atom1[i]][atom2[i]] = distance[i];
+            settleConstraints[atom2[i]][atom1[i]] = distance[i];
+        }
+    }
+    // Now remove the ones that don't actually form closed loops of three atoms.
+    vector<int> settleClusters;
+    for (int i = 0; i < (int)settleConstraints.size(); i++) {
+        if (settleConstraints[i].size() == 2) {
+            int partner1 = settleConstraints[i].begin()->first;
+            int partner2 = (++settleConstraints[i].begin())->first;
+            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
+                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
+                settleConstraints[i].clear();
+            else if (i < partner1 && i < partner2)
+                settleClusters.push_back(i);
+        }
+        else
+            settleConstraints[i].clear();
+    }
+    // Record the actual SETTLE clusters.
+    CUDAStream<int4>* psSettleID          = new CUDAStream<int4>((int) settleClusters.size(), 1, "SettleID");
+    gpu->psSettleID                       = psSettleID;
+    gpu->sim.pSettleID                    = psSettleID->_pDevStream[0];
+    CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1, "SettleParameter");
+    gpu->psSettleParameter                = psSettleParameter;
+    gpu->sim.pSettleParameter             = psSettleParameter->_pDevStream[0];
+    gpu->sim.settleConstraints            = settleClusters.size();
+      for (int i = 0; i < (int)settleClusters.size(); i++) {
+        int atom1 = settleClusters[i];
+        int atom2 = settleConstraints[atom1].begin()->first;
+        int atom3 = (++settleConstraints[atom1].begin())->first;
+        float dist12 = settleConstraints[atom1].find(atom2)->second;
+        float dist13 = settleConstraints[atom1].find(atom3)->second;
+        float dist23 = settleConstraints[atom2].find(atom3)->second;
+        if (dist12 == dist13) { // atom1 is the central atom
+            (*psSettleID)[i].x = atom1;
+            (*psSettleID)[i].y = atom2;
+            (*psSettleID)[i].z = atom3;
+            (*psSettleParameter)[i].x = dist12;
+            (*psSettleParameter)[i].y = dist23;
+        }
+        else if (dist12 == dist23) { // atom2 is the central atom
+            (*psSettleID)[i].x = atom2;
+            (*psSettleID)[i].y = atom1;
+            (*psSettleID)[i].z = atom3;
+            (*psSettleParameter)[i].x = dist12;
+            (*psSettleParameter)[i].y = dist13;
+        }
+        else if (dist13 == dist23) { // atom3 is the central atom
+            (*psSettleID)[i].x = atom3;
+            (*psSettleID)[i].y = atom1;
+            (*psSettleID)[i].z = atom2;
+            (*psSettleParameter)[i].x = dist13;
+            (*psSettleParameter)[i].y = dist12;
+        }
+        else
+            throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+        isShakeAtom[atom1] = true;
+        isShakeAtom[atom2] = true;
+        isShakeAtom[atom3] = true;
+    }
+    psSettleID->Upload();
+    psSettleParameter->Upload();
+    gpu->sim.settle_threads_per_block     = (gpu->sim.settleConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.settle_threads_per_block > gpu->sim.max_shake_threads_per_block)
+        gpu->sim.settle_threads_per_block = gpu->sim.max_shake_threads_per_block;
+    if (gpu->sim.settle_threads_per_block < 1)
+        gpu->sim.settle_threads_per_block = 1;
+    // Find clusters consisting of a central atom with up to three peripheral atoms.
+    map<int, ShakeCluster> clusters;
+    vector<bool> invalidForShake(gpu->natoms, false);
+    for (int i = 0; i < (int)atom1.size(); i++) {
+        if (isShakeAtom[atom1[i]])
+            continue; // This is being taken care of with SETTLE.
+        // Determine which is the central atom.
+        bool firstIsCentral;
+        if (constraintCount[atom1[i]] > 1)
+            firstIsCentral = true;
+        else if (constraintCount[atom2[i]] > 1)
+            firstIsCentral = false;
+        else if (atom1[i] < atom2[i])
+            firstIsCentral = true;
+        else
+            firstIsCentral = false;
+        int centralID, peripheralID;
+        float centralInvMass, peripheralInvMass;
+        if (firstIsCentral) {
+            centralID = atom1[i];
+            peripheralID = atom2[i];
+            centralInvMass = invMass1[i];
+            peripheralInvMass = invMass2[i];
+        }
+        else {
+            centralID = atom2[i];
+            peripheralID = atom1[i];
+            centralInvMass = invMass2[i];
+            peripheralInvMass = invMass1[i];
+        }
+        // Add it to the cluster.
+        if (clusters.find(centralID) == clusters.end()) {
+            clusters[centralID] = ShakeCluster(centralID, centralInvMass);
+        }
+        ShakeCluster& cluster = clusters[centralID];
+        cluster.addAtom(peripheralID, distance[i], peripheralInvMass);
+        if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
+            markShakeClusterInvalid(cluster, clusters, invalidForShake);
+            map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
+            if (otherCluster != clusters.end() && otherCluster->second.valid)
+                markShakeClusterInvalid(otherCluster->second, clusters, invalidForShake);
+        }
+    }
+    int validShakeClusters = 0;
+    for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        ShakeCluster& cluster = iter->second;
+        if (cluster.valid) {
+            cluster.valid = !invalidForShake[cluster.centralID];
+            for (int i = 0; i < cluster.size; i++)
+                if (invalidForShake[cluster.peripheralID[i]])
+                    cluster.valid = false;
+            if (cluster.valid)
+                ++validShakeClusters;
+        }
+    }
+    // Fill in the Cuda streams.
+    CUDAStream<int4>* psShakeID             = new CUDAStream<int4>(validShakeClusters, 1, "ShakeID");
+    gpu->psShakeID                          = psShakeID;
+    gpu->sim.pShakeID                       = psShakeID->_pDevStream[0];
+    CUDAStream<float4>* psShakeParameter    = new CUDAStream<float4>(validShakeClusters, 1, "ShakeParameter");
+    gpu->psShakeParameter                   = psShakeParameter;
+    gpu->sim.pShakeParameter                = psShakeParameter->_pDevStream[0];
+    gpu->sim.ShakeConstraints               = validShakeClusters;
+    int index = 0;
+    for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
+        const ShakeCluster& cluster = iter->second;
+        if (!cluster.valid)
+            continue;
+        (*psShakeID)[index].x = cluster.centralID;
+        (*psShakeID)[index].y = cluster.peripheralID[0];
+        (*psShakeID)[index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
+        (*psShakeID)[index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
+        (*psShakeParameter)[index].x = cluster.centralInvMass;
+        (*psShakeParameter)[index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
+        (*psShakeParameter)[index].z = cluster.distance*cluster.distance;
+        (*psShakeParameter)[index].w = cluster.peripheralInvMass;
+        isShakeAtom[cluster.centralID] = true;
+        isShakeAtom[cluster.peripheralID[0]] = true;
+        if (cluster.size > 1)
+            isShakeAtom[cluster.peripheralID[1]] = true;
+        if (cluster.size > 2)
+            isShakeAtom[cluster.peripheralID[2]] = true;
+        ++index;
+    }
+    psShakeID->Upload();
+    psShakeParameter->Upload();
+    gpu->sim.shakeTolerance = constraintTolerance;
+    gpu->sim.shake_threads_per_block     = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
+        gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
+    if (gpu->sim.shake_threads_per_block < 1)
+        gpu->sim.shake_threads_per_block = 1;
+    // Find connected constraints for CCMA.
+    vector<int> ccmaConstraints;
+    for (unsigned i = 0; i < atom1.size(); i++)
+        if (!isShakeAtom[atom1[i]])
+            ccmaConstraints.push_back(i);
+    // Record the connections between constraints.
+    int numCCMA = (int) ccmaConstraints.size();
+    vector<vector<int> > atomConstraints(gpu->natoms);
+    for (int i = 0; i < numCCMA; i++) {
+        atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
+        atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
+    }
+    vector<vector<int> > linkedConstraints(numCCMA);
+    for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
+        for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
+            for (unsigned j = 0; j < i; j++) {
+                int c1 = atomConstraints[atom][i];
+                int c2 = atomConstraints[atom][j];
+                linkedConstraints[c1].push_back(c2);
+                linkedConstraints[c2].push_back(c1);
+            }
+    }
+    int maxLinks = 0;
+    for (unsigned i = 0; i < linkedConstraints.size(); i++)
+        maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
+    int maxAtomConstraints = 0;
+    for (unsigned i = 0; i < atomConstraints.size(); i++)
+        maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
+    // Compute the constraint coupling matrix
+    vector<vector<int> > atomAngles(gpu->natoms);
+    for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
+        atomAngles[(*gpu->psBondAngleID1)[i].y].push_back(i);
+    vector<vector<pair<int, double> > > matrix(numCCMA);
+    if (numCCMA > 0) {
+        for (int j = 0; j < numCCMA; j++) {
+            for (int k = 0; k < numCCMA; k++) {
+                if (j == k) {
+                    matrix[j].push_back(pair<int, double>(j, 1.0));
+                    continue;
+                }
+                double scale;
+                int cj = ccmaConstraints[j];
+                int ck = ccmaConstraints[k];
+                int atomj0 = atom1[cj];
+                int atomj1 = atom2[cj];
+                int atomk0 = atom1[ck];
+                int atomk1 = atom2[ck];
+                int atoma, atomb, atomc;
+                if (atomj0 == atomk0) {
+                    atoma = atomj1;
+                    atomb = atomj0;
+                    atomc = atomk1;
+                    scale = invMass1[cj]/(invMass1[cj]+invMass2[cj]);
+                }
+                else if (atomj1 == atomk1) {
+                    atoma = atomj0;
+                    atomb = atomj1;
+                    atomc = atomk0;
+                    scale = invMass2[cj]/(invMass1[cj]+invMass2[cj]);
+                }
+                else if (atomj0 == atomk1) {
+                    atoma = atomj1;
+                    atomb = atomj0;
+                    atomc = atomk0;
+                    scale = invMass1[cj]/(invMass1[cj]+invMass2[cj]);
+                }
+                else if (atomj1 == atomk0) {
+                    atoma = atomj0;
+                    atomb = atomj1;
+                    atomc = atomk1;
+                    scale = invMass2[cj]/(invMass1[cj]+invMass2[cj]);
+                }
+                else
+                    continue; // These constraints are not connected.
+                // Look for a third constraint forming a triangle with these two.
+                bool foundConstraint = false;
+                for (int other = 0; other < numCCMA; other++) {
+                    if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
+                        double d1 = distance[cj];
+                        double d2 = distance[ck];
+                        double d3 = distance[other];
+                        matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
+                        foundConstraint = true;
+                        break;
+                    }
+                }
+                if (!foundConstraint) {
+                    // We didn't find one, so look for an angle force field term.
+                    const vector<int>& angleCandidates = atomAngles[atomb];
+                    for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
+                        int4 atoms = (*gpu->psBondAngleID1)[*iter];
+                        if ((atoms.x == atoma && atoms.z == atomc) || (atoms.z == atoma && atoms.x == atomc)) {
+                            double angle = (*gpu->psBondAngleParameter)[*iter].x;
+                            matrix[j].push_back(pair<int, double>(k, scale*cos(angle*PI/180.0)));
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        // Invert it using QR.
+        vector<int> matrixRowStart;
+        vector<int> matrixColIndex;
+        vector<double> matrixValue;
+        for (int i = 0; i < numCCMA; i++) {
+            matrixRowStart.push_back(matrixValue.size());
+            for (int j = 0; j < (int) matrix[i].size(); j++) {
+                pair<int, double> element = matrix[i][j];
+                matrixColIndex.push_back(element.first);
+                matrixValue.push_back(element.second);
+            }
+        }
+        matrixRowStart.push_back(matrixValue.size());
+        int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
+        double *qValue, *rValue;
+        int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
+                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
+        vector<double> rhs(numCCMA);
+        matrix.clear();
+        matrix.resize(numCCMA);
+        for (int i = 0; i < numCCMA; i++) {
+            // Extract column i of the inverse matrix.
+            for (int j = 0; j < numCCMA; j++)
+                rhs[j] = (i == j ? 1.0 : 0.0);
+            result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
+            result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
+            for (int j = 0; j < numCCMA; j++) {
+                double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
+                if (abs(value) > 0.05)
+                    matrix[j].push_back(pair<int, double>(i, value));
+            }
+        }
+        QUERN_free_result(qRowStart, qColIndex, qValue);
+        QUERN_free_result(rRowStart, rColIndex, rValue);
+    }
+    int maxRowElements = 0;
+    for (unsigned i = 0; i < matrix.size(); i++)
+        maxRowElements = max(maxRowElements, (int) matrix[i].size());
+    maxRowElements++;
+    // Sort the constraints.
+    vector<int> constraintOrder(numCCMA);
+    for (int i = 0; i < numCCMA; ++i)
+        constraintOrder[i] = i;
+    sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2));
+    vector<int> inverseOrder(numCCMA);
+    for (int i = 0; i < numCCMA; ++i)
+        inverseOrder[constraintOrder[i]] = i;
+    for (int i = 0; i < (int)matrix.size(); ++i)
+        for (int j = 0; j < (int)matrix[i].size(); ++j)
+            matrix[i][j].first = inverseOrder[matrix[i][j].first];
+    // Fill in the CUDA streams.
+    CUDAStream<int2>* psCcmaAtoms = new CUDAStream<int2>(numCCMA, 1, "CcmaAtoms");
+    gpu->psCcmaAtoms              = psCcmaAtoms;
+    gpu->sim.pCcmaAtoms           = psCcmaAtoms->_pDevData;
+    CUDAStream<float4>* psCcmaDistance = new CUDAStream<float4>(numCCMA, 1, "CcmaDistance");
+    gpu->psCcmaDistance                = psCcmaDistance;
+    gpu->sim.pCcmaDistance             = psCcmaDistance->_pDevData;
+    CUDAStream<int>* psCcmaAtomConstraints = new CUDAStream<int>(gpu->natoms*maxAtomConstraints, 1, "CcmaAtomConstraints");
+    gpu->psCcmaAtomConstraints             = psCcmaAtomConstraints;
+    gpu->sim.pCcmaAtomConstraints          = psCcmaAtomConstraints->_pDevData;
+    CUDAStream<int>* psCcmaNumAtomConstraints = new CUDAStream<int>(gpu->natoms, 1, "CcmaAtomConstraintsIndex");
+    gpu->psCcmaNumAtomConstraints             = psCcmaNumAtomConstraints;
+    gpu->sim.pCcmaNumAtomConstraints          = psCcmaNumAtomConstraints->_pDevData;
+    CUDAStream<float>* psCcmaDelta1 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta1");
+    gpu->psCcmaDelta1             = psCcmaDelta1;
+    gpu->sim.pCcmaDelta1          = psCcmaDelta1->_pDevData;
+    CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
+    gpu->psCcmaDelta2             = psCcmaDelta2;
+    gpu->sim.pCcmaDelta2          = psCcmaDelta2->_pDevData;
+    CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
+    gpu->psCcmaReducedMass             = psCcmaReducedMass;
+    gpu->sim.pCcmaReducedMass          = psCcmaReducedMass->_pDevData;
+    CUDAStream<unsigned int>* psConstraintMatrixColumn = new CUDAStream<unsigned int>(numCCMA*maxRowElements, 1, "ConstraintMatrixColumn");
+    gpu->psConstraintMatrixColumn               = psConstraintMatrixColumn;
+    gpu->sim.pConstraintMatrixColumn            = psConstraintMatrixColumn->_pDevData;
+    CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue");
+    gpu->psConstraintMatrixValue             = psConstraintMatrixValue;
+    gpu->sim.pConstraintMatrixValue          = psConstraintMatrixValue->_pDevData;
+    cudaHostAlloc((void**) &gpu->ccmaConvergedHostMarker, sizeof(int), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**) &gpu->sim.ccmaConvergedDeviceMarker, (void*) gpu->ccmaConvergedHostMarker, 0);
+    cudaEventCreate(&gpu->ccmaEvent);
+    gpu->sim.ccmaConstraints = numCCMA;
+    for (int i = 0; i < numCCMA; i++) {
+        int index = constraintOrder[i];
+        int c = ccmaConstraints[index];
+        (*psCcmaAtoms)[i].x = atom1[c];
+        (*psCcmaAtoms)[i].y = atom2[c];
+        (*psCcmaDistance)[i].w = distance[c];
+        (*psCcmaReducedMass)[i] = 0.5f/(invMass1[c]+invMass2[c]);
+        for (unsigned int j = 0; j < matrix[index].size(); j++) {
+            (*psConstraintMatrixColumn)[i+j*numCCMA] = matrix[index][j].first;
+            (*psConstraintMatrixValue)[i+j*numCCMA] = (float) matrix[index][j].second;
+        }
+        (*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA;
+    }
+    for (unsigned int i = 0; i < atomConstraints.size(); i++) {
+        (*psCcmaNumAtomConstraints)[i] = atomConstraints[i].size();
+        for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
+            bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
+            (*psCcmaAtomConstraints)[i+j*gpu->natoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
+        }
+    }
+    psCcmaAtoms->Upload();
+    psCcmaDistance->Upload();
+    psCcmaReducedMass->Upload();
+    psCcmaAtomConstraints->Upload();
+    psCcmaNumAtomConstraints->Upload();
+    psConstraintMatrixColumn->Upload();
+    psConstraintMatrixValue->Upload();
+    gpu->sim.ccma_threads_per_block = (gpu->sim.ccmaConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.ccma_threads_per_block > gpu->sim.threads_per_block)
+        gpu->sim.ccma_threads_per_block = gpu->sim.threads_per_block;
+    if (gpu->sim.ccma_threads_per_block < gpu->sim.blocks)
+        gpu->sim.ccma_threads_per_block = gpu->sim.blocks;
+}
+extern "C"
+int gpuAllocateInitialBuffers(gpuContext gpu)
+{
+    gpu->sim.atoms                      = gpu->natoms;
+    gpu->sim.paddedNumberOfAtoms        = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
+    gpu->sim.degreesOfFreedom           = 3 * gpu->sim.atoms - 6;
+    gpu->gpAtomTable                    = NULL;
+    gpu->gAtomTypes                     = 0;
+    gpu->psPosq4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Posq");
+    gpu->sim.stride                     = gpu->psPosq4->_stride;
+    gpu->sim.stride2                    = gpu->sim.stride * 2;
+    gpu->sim.stride3                    = gpu->sim.stride * 3;
+    gpu->sim.stride4                    = gpu->sim.stride * 4;
+    gpu->sim.pPosq                      = gpu->psPosq4->_pDevStream[0];
+    gpu->sim.stride                     = gpu->psPosq4->_stride;
+    gpu->sim.stride2                    = 2 * gpu->sim.stride;
+    gpu->sim.stride3                    = 3 * gpu->sim.stride;
+    gpu->sim.stride4                    = 4 * gpu->sim.stride;
+    gpu->psPosqP4                       = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "PosqP");
+    gpu->sim.pPosqP                     = gpu->psPosqP4->_pDevStream[0];
+    gpu->psOldPosq4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "OldPosq");
+    gpu->sim.pOldPosq                   = gpu->psOldPosq4->_pDevStream[0];
+    gpu->psVelm4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Velm");
+    gpu->sim.pVelm4                     = gpu->psVelm4->_pDevStream[0];
+    gpu->psBornRadii                    = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "BornRadii");
+    gpu->sim.pBornRadii                 = gpu->psBornRadii->_pDevStream[0];
+    gpu->psObcChain                     = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "ObcChain");
+    gpu->sim.pObcChain                  = gpu->psObcChain->_pDevStream[0];
+    gpu->psSigEps2                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
+    gpu->sim.pAttr                      = gpu->psSigEps2->_pDevStream[0];
+    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
+    gpu->sim.pObcData                   = gpu->psObcData->_pDevStream[0];
+    gpu->psGBVIData                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "GBVIData");
+    gpu->sim.pGBVIData                  = gpu->psGBVIData->_pDevStream[0];
+    gpu->psGBVISwitchDerivative        = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "psGBVISwitchDerivative");
+    gpu->sim.pGBVISwitchDerivative      = gpu->psGBVISwitchDerivative->_pDevStream[0];
+    gpu->psStepSize                     = new CUDAStream<float2>(1, 1, "StepSize");
+    gpu->sim.pStepSize                  = gpu->psStepSize->_pDevStream[0];
+    (*gpu->psStepSize)[0] = make_float2(0.0f, 0.0f);
+    gpu->psStepSize->Upload();
+    gpu->psLangevinParameters           = new CUDAStream<float>(3, 1, "LangevinParameters");
+    gpu->sim.pLangevinParameters        = gpu->psLangevinParameters->_pDevStream[0];
+    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
+    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1, "AtomIndex");
+    gpu->sim.pAtomIndex                 = gpu->psAtomIndex->_pDevStream[0];
+    for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
+        (*gpu->psAtomIndex)[i] = i;
+    gpu->psAtomIndex->Upload();
+    gpu->posCellOffsets.resize(gpu->natoms, make_int3(0, 0, 0));
+    gpu->sim.outputBuffers = 0;
+    // Determine randoms
+    gpu->seed                           = 1;
+    gpu->sim.randomFrames               = 20;
+    gpu->sim.randomIterations           = gpu->sim.randomFrames;
+    gpu->sim.randoms                    = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms;
+    gpu->sim.totalRandoms               = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
+    gpu->psRandom4                      = new CUDAStream<float4>(gpu->sim.totalRandoms, 1, "Random4");
+    gpu->psRandom2                      = new CUDAStream<float2>(gpu->sim.totalRandoms, 1, "Random2");
+    gpu->psRandomPosition               = new CUDAStream<int>(gpu->sim.blocks, 1, "RandomPosition");
+    gpu->psRandomSeed                   = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1, "RandomSeed");
+    gpu->sim.pRandom4                   = gpu->psRandom4->_pDevStream[0];
+    gpu->sim.pRandom2                   = gpu->psRandom2->_pDevStream[0];
+    gpu->sim.pRandomPosition            = gpu->psRandomPosition->_pDevStream[0];
+    gpu->sim.pRandomSeed                = gpu->psRandomSeed->_pDevStream[0];
+    // Allocate and clear linear momentum buffer
+    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1, "LinearMomentum");
+    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
+    for (int i = 0; i < (int) gpu->sim.blocks; i++)
+    {
+        (*gpu->psLinearMomentum)[i].x = 0.0f;
+        (*gpu->psLinearMomentum)[i].y = 0.0f;
+        (*gpu->psLinearMomentum)[i].z = 0.0f;
+        (*gpu->psLinearMomentum)[i].w = 0.0f;
+    }
+    gpu->psLinearMomentum->Upload();
+    return 1;
+}
+extern "C"
+void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
+{
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        (*gpu->psPosq4)[i].x = x[i];
+        (*gpu->psPosq4)[i].y = y[i];
+        (*gpu->psPosq4)[i].z = z[i];
+    }
+    gpu->psPosq4->Upload();
+	 // set flag to recalculate Born radii
+	 gpu->bRecalculateBornRadii = true;
+} 
+extern "C"
+void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
+{
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        (*gpu->psVelm4)[i].x = x[i];
+        (*gpu->psVelm4)[i].y = y[i];
+        (*gpu->psVelm4)[i].z = z[i];
+    }
+    gpu->psVelm4->Upload();
+} 
+extern "C"
+void gpuSetMass(gpuContext gpu, const vector<float>& mass)
+{
+    float totalMass = 0.0f;
+    for (int i = 0; i < gpu->natoms; i++)
+    {
+        (*gpu->psVelm4)[i].w = 1.0f/mass[i];
+        totalMass += mass[i];
+    }
+    gpu->sim.inverseTotalMass = 1.0f / totalMass;
+    gpu->psVelm4->Upload();
+} 
+extern "C"
+void gpuInitializeRandoms(gpuContext gpu)
+{
+    for (int i = 0; i < (int) gpu->sim.blocks; i++)
+    {
+        (*gpu->psRandomPosition)[i] = 0;
+    }
+    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
+#if 0
+    srand(seed);
+    for (int i = 0; i < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); i++)
+    {
+        (*gpu->psRandomSeed)[i].x = rand();
+        (*gpu->psRandomSeed)[i].y = rand();
+        (*gpu->psRandomSeed)[i].z = rand();
+        (*gpu->psRandomSeed)[i].w = rand();
+    }
+#else
+    RNG rng(seed);
+    for (int i = 0; i < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); i++)
+    {
+        (*gpu->psRandomSeed)[i].x = rng.rand_int();
+        (*gpu->psRandomSeed)[i].y = rng.rand_int();
+        (*gpu->psRandomSeed)[i].z = rng.rand_int();
+        (*gpu->psRandomSeed)[i].w = rng.rand_int();
+    }
+#endif
+    gpu->psRandomPosition->Upload();
+    gpu->psRandomSeed->Upload();
+    gpuSetConstants(gpu);
+    kGenerateRandoms(gpu);
+    return;
+}
+extern "C"
+bool gpuIsAvailable()
+{
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    return (deviceCount > 0);
+}
+extern "C"
+void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
+{
+    gpuContext gpu = new _gpuContext;
+    int LRFSize = 0;
+    int SMCount = 0;
+    int SMMajor = 0;
+    int SMMinor = 0;
+    // Select which device to use
+    int currentDevice;
+    cudaError_t status = cudaGetDevice(&currentDevice);
+    RTERROR(status, "Error getting CUDA device")
+    if (device != currentDevice)
+        cudaSetDevice(device); // Ignore errors
+    status = cudaGetDevice(&gpu->device);
+    RTERROR(status, "Error getting CUDA device")
+    status = cudaSetDeviceFlags(cudaDeviceMapHost+(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto));
+    RTERROR(status, "Error setting device flags")
+    gpu->useBlockingSync = useBlockingSync;
+    // Determine kernel call configuration
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, currentDevice);
+    // Determine SM version
+    if (deviceProp.major == 1)
+    {
+        switch (deviceProp.minor)
+        {
+        case 0:
+        case 1:
+            gpu->sm_version = SM_10;
+            gpu->sim.workUnitsPerSM = G8X_NONBOND_WORKUNITS_PER_SM;
+            break;
+        default:
+            gpu->sm_version = SM_12;
+            gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
+            break;
+        }
+    } 
+    else
+    {    
+        gpu->sm_version = SM_20;
+        gpu->sim.workUnitsPerSM = GF1XX_NONBOND_WORKUNITS_PER_SM;
+    }
+    if (deviceProp.regsPerBlock == 8192)
+    {
+        gpu->sim.nonbond_threads_per_block          = G8X_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.bornForce2_threads_per_block       = G8X_BORNFORCE2_THREADS_PER_BLOCK;
+        gpu->sim.max_shake_threads_per_block        = G8X_SHAKE_THREADS_PER_BLOCK;
+        gpu->sim.max_update_threads_per_block       = G8X_UPDATE_THREADS_PER_BLOCK;
+        gpu->sim.max_localForces_threads_per_block  = G8X_LOCALFORCES_THREADS_PER_BLOCK;
+        gpu->sim.threads_per_block                  = G8X_THREADS_PER_BLOCK;
+        gpu->sim.random_threads_per_block           = G8X_RANDOM_THREADS_PER_BLOCK;
+        gpu->blocksPerSM                            = G8X_BLOCKS_PER_SM;
+    }
+    else if (deviceProp.regsPerBlock <= 16384)
+    {
+        gpu->sim.nonbond_threads_per_block          = GT2XX_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.bornForce2_threads_per_block       = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
+        gpu->sim.max_shake_threads_per_block        = GT2XX_SHAKE_THREADS_PER_BLOCK;
+        gpu->sim.max_update_threads_per_block       = GT2XX_UPDATE_THREADS_PER_BLOCK;
+        gpu->sim.max_localForces_threads_per_block  = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
+        gpu->sim.threads_per_block                  = GT2XX_THREADS_PER_BLOCK;
+        gpu->sim.random_threads_per_block           = GT2XX_RANDOM_THREADS_PER_BLOCK;
+        gpu->blocksPerSM                            = GT2XX_BLOCKS_PER_SM;
+    }
+    else
+    {
+        gpu->sim.nonbond_threads_per_block          = GF1XX_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.bornForce2_threads_per_block       = GF1XX_BORNFORCE2_THREADS_PER_BLOCK;
+        gpu->sim.max_shake_threads_per_block        = GF1XX_SHAKE_THREADS_PER_BLOCK;
+        gpu->sim.max_update_threads_per_block       = GF1XX_UPDATE_THREADS_PER_BLOCK;
+        gpu->sim.max_localForces_threads_per_block  = GF1XX_LOCALFORCES_THREADS_PER_BLOCK;
+        gpu->sim.threads_per_block                  = GF1XX_THREADS_PER_BLOCK;
+        gpu->sim.random_threads_per_block           = GF1XX_RANDOM_THREADS_PER_BLOCK;
+        gpu->blocksPerSM                            = GF1XX_BLOCKS_PER_SM;
+    }
+    gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
+    gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
+    gpu->sim.blocks = deviceProp.multiProcessorCount;
+    gpu->sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
+    gpu->sim.shake_threads_per_block                = gpu->sim.max_shake_threads_per_block;
+    gpu->sim.localForces_threads_per_block          = gpu->sim.max_localForces_threads_per_block;
+    gpu->natoms = numAtoms;
+    gpuAllocateInitialBuffers(gpu);
+    gpu->iterations = 0;
+    gpu->sim.update_threads_per_block               = (gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.update_threads_per_block > gpu->sim.max_update_threads_per_block)
+        gpu->sim.update_threads_per_block = gpu->sim.max_update_threads_per_block;
+    if (gpu->sim.update_threads_per_block < gpu->psLangevinParameters->_length)
+            gpu->sim.update_threads_per_block = gpu->psLangevinParameters->_length;
+    gpu->sim.bf_reduce_threads_per_block = gpu->sim.update_threads_per_block;
+    gpu->sim.bsf_reduce_threads_per_block = (gpu->sim.stride4 + gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    gpu->sim.bsf_reduce_threads_per_block = ((gpu->sim.bsf_reduce_threads_per_block + (GRID - 1)) / GRID) * GRID;
+    if (gpu->sim.bsf_reduce_threads_per_block > gpu->sim.threads_per_block)
+        gpu->sim.bsf_reduce_threads_per_block = gpu->sim.threads_per_block;
+    if (gpu->sim.bsf_reduce_threads_per_block < 1)
+        gpu->sim.bsf_reduce_threads_per_block = 1;
+    // Initialize constants to reasonable values
+    gpu->sim.probeRadius            = probeRadius;
+    gpu->sim.surfaceAreaFactor      = surfaceAreaFactor;
+    gpu->sim.electricConstant       = electricConstant;
+    gpu->sim.nonbondedMethod        = NO_CUTOFF;
+    gpu->sim.nonbondedCutoff        = 0.0f;
+    gpu->sim.nonbondedCutoffSqr     = 0.0f;
+    gpu->sim.bigFloat               = 99999999.0f;
+    gpu->sim.forceConversionFactor  = forceConversionFactor;
+    gpu->sim.preFactor              = 2.0f*electricConstant*((1.0f/defaultInnerDielectric)-(1.0f/defaultSolventDielectric))*gpu->sim.forceConversionFactor;
+    gpu->sim.dielectricOffset       = dielectricOffset;
+    gpu->sim.alphaOBC               = alphaOBC;
+    gpu->sim.betaOBC                = betaOBC;
+    gpu->sim.gammaOBC               = gammaOBC;
+    gpu->sim.maxShakeIterations     = 15;
+    gpu->sim.shakeTolerance         = 1.0e-04f * 2.0f;
+    gpu->sim.InvMassJ               = 9.920635e-001f;
+    gpu->grid                       = GRID;
+    gpu->bCalculateCM               = false;
+    gpu->bRemoveCM                  = false;
+    gpu->bRecalculateBornRadii      = true;
+    gpu->bIncludeGBSA               = false;
+    gpu->bIncludeGBVI               = false;
+    gpuInitializeRandoms(gpu);
+    // To be determined later
+    gpu->psLJ14ID                   = NULL;
+    gpu->psForce4                   = NULL;
+    gpu->psEnergy                   = NULL;
+    gpu->sim.pForce4                = NULL;
+    gpu->psBornForce                = NULL;
+    gpu->sim.pBornForce             = NULL;
+    gpu->psBornSum                  = NULL;
+    gpu->sim.pBornSum               = NULL;
+    gpu->psBondID                   = NULL;
+    gpu->psBondParameter            = NULL;
+    gpu->psBondAngleID1             = NULL;
+    gpu->psBondAngleID2             = NULL;
+    gpu->psBondAngleParameter       = NULL;
+    gpu->psDihedralID1              = NULL;
+    gpu->psDihedralID2              = NULL;
+    gpu->psDihedralParameter        = NULL;
+    gpu->psRbDihedralID1            = NULL;
+    gpu->psRbDihedralID2            = NULL;
+    gpu->psRbDihedralParameter1     = NULL;
+    gpu->psRbDihedralParameter2     = NULL;
+    gpu->psLJ14ID                   = NULL;
+    gpu->psLJ14Parameter            = NULL;
+    gpu->psCustomParams             = NULL;
+    gpu->psCustomBondID             = NULL;
+    gpu->psCustomBondParams         = NULL;
+    gpu->psCustomAngleID1           = NULL;
+    gpu->psCustomAngleID2           = NULL;
+    gpu->psCustomAngleParams        = NULL;
+    gpu->psCustomTorsionID1         = NULL;
+    gpu->psCustomTorsionID2         = NULL;
+    gpu->psCustomTorsionParams      = NULL;
+    gpu->psCustomExternalID         = NULL;
+    gpu->psCustomExternalParams     = NULL;
+    gpu->psEwaldCosSinSum           = NULL;
+    gpu->psTabulatedErfc            = NULL;
+    gpu->psPmeGrid                  = NULL;
+    gpu->psPmeBsplineModuli[0]      = NULL;
+    gpu->psPmeBsplineModuli[1]      = NULL;
+    gpu->psPmeBsplineModuli[2]      = NULL;
+    gpu->psPmeBsplineTheta          = NULL;
+    gpu->psPmeBsplineDtheta         = NULL;
+    gpu->psPmeAtomRange             = NULL;
+    gpu->psPmeAtomGridIndex         = NULL;
+    gpu->psShakeID                  = NULL;
+    gpu->psShakeParameter           = NULL;
+    gpu->psSettleID                 = NULL;
+    gpu->psSettleParameter          = NULL;
+    gpu->psExclusion                = NULL;
+    gpu->psExclusionIndex           = NULL;
+    gpu->psWorkUnit                 = NULL;
+    gpu->psInteractingWorkUnit      = NULL;
+    gpu->psInteractionFlag          = NULL;
+    gpu->psInteractionCount         = NULL;
+    gpu->psGridBoundingBox          = NULL;
+    gpu->psGridCenter               = NULL;
+    gpu->psCcmaAtoms                = NULL;
+    gpu->psCcmaDistance             = NULL;
+    gpu->psCcmaAtomConstraints      = NULL;
+    gpu->psCcmaNumAtomConstraints   = NULL;
+    gpu->psCcmaDelta1               = NULL;
+    gpu->psCcmaDelta2               = NULL;
+    gpu->psCcmaReducedMass          = NULL;
+    gpu->psConstraintMatrixColumn   = NULL;
+    gpu->psConstraintMatrixValue    = NULL;
+    gpu->psTabulatedFunctionParams  = NULL;
+    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++)
+        gpu->tabulatedFunctions[i].coefficients = NULL;
+    gpu->sim.customExpressionStackSize = 0;
+    gpu->sim.customBonds = 0;
+    gpu->sim.customAngles = 0;
+    gpu->sim.customTorsions = 0;
+    // Initialize output buffer before reading parameters
+    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
+    memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
+    return (void*)gpu;
+}
+extern "C"
+void gpuSetLangevinIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature, float errorTol) {
+    gpu->sim.deltaT                 = deltaT;
+    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
+    gpu->sim.errorTol               = errorTol;
+    gpu->sim.tau                    = tau;
+    gpu->sim.T                      = temperature;
+    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
+    double vscale = exp(-deltaT/tau);
+    double fscale = (1-vscale)*tau;
+    double noisescale = sqrt(2*gpu->sim.kT/tau)*sqrt(0.5*(1-vscale*vscale)*tau);
+    (*gpu->psLangevinParameters)[0] = (float) vscale;
+    (*gpu->psLangevinParameters)[1] = (float) fscale;
+    (*gpu->psLangevinParameters)[2] = (float) noisescale;
+    gpu->psLangevinParameters->Upload();
+    gpu->psStepSize->Download();
+    if ((*gpu->psStepSize)[0].x == 0)
+        (*gpu->psStepSize)[0].x = deltaT;
+    (*gpu->psStepSize)[0].y = deltaT;
+    gpu->psStepSize->Upload();
+}
+extern "C"
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol) {
+    gpu->sim.deltaT                 = deltaT;
+    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
+    gpu->sim.errorTol               = errorTol;
+    gpu->psStepSize->Download();
+    if ((*gpu->psStepSize)[0].x == 0)
+        (*gpu->psStepSize)[0].x = deltaT;
+    (*gpu->psStepSize)[0].y = deltaT;
+    gpu->psStepSize->Upload();
+}
+extern "C"
+void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
+    gpu->sim.deltaT                 = deltaT;
+    gpu->sim.oneOverDeltaT          = 1.0f/deltaT;
+    gpu->sim.tau                    = tau;
+    gpu->sim.tauDeltaT              = gpu->sim.deltaT * gpu->sim.tau;
+    gpu->sim.T                      = temperature;
+    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
+    gpu->sim.noiseAmplitude         = sqrt(2.0f*gpu->sim.kT*deltaT*tau);
+    gpu->psStepSize->Download();
+    if ((*gpu->psStepSize)[0].x == 0)
+        (*gpu->psStepSize)[0].x = deltaT;
+    (*gpu->psStepSize)[0].y = deltaT;
+    gpu->psStepSize->Upload();
+}
+extern "C"
+void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionFrequency) {
+    gpu->sim.T                      = temperature;
+    gpu->sim.kT                     = BOLTZ * gpu->sim.T;
+    gpu->sim.collisionFrequency     = collisionFrequency;
+}
+extern "C"
+void gpuShutDown(gpuContext gpu)
+{
+    // Delete sysmem pointers
+    delete[] gpu->pOutputBufferCounter;
+    delete[] gpu->gpAtomTable;
+    delete[] gpu->pAtomSymbol;
+    // Delete device pointers
+    delete gpu->psPosq4;
+    delete gpu->psPosqP4;
+    delete gpu->psOldPosq4;
+    delete gpu->psVelm4;
+    delete gpu->psForce4;
+    delete gpu->psEnergy;
+    delete gpu->psSigEps2;
+    if (gpu->psCustomParams != NULL)
+        delete gpu->psCustomParams;
+    if (gpu->psCustomBondParams != NULL) {
+        delete gpu->psCustomBondID;
+        delete gpu->psCustomBondParams;
+    }
+    if (gpu->psCustomAngleParams != NULL) {
+        delete gpu->psCustomAngleID1;
+        delete gpu->psCustomAngleID2;
+        delete gpu->psCustomAngleParams;
+    }
+    if (gpu->psCustomTorsionParams != NULL) {
+        delete gpu->psCustomTorsionID1;
+        delete gpu->psCustomTorsionID2;
+        delete gpu->psCustomTorsionParams;
+    }
+    if (gpu->psCustomExternalParams != NULL) {
+        delete gpu->psCustomExternalID;
+        delete gpu->psCustomExternalParams;
+    }
+    if (gpu->psEwaldCosSinSum != NULL)
+        delete gpu->psEwaldCosSinSum;
+    if (gpu->psPmeGrid != NULL) {
+        delete gpu->psPmeGrid;
+        delete gpu->psPmeBsplineModuli[0];
+        delete gpu->psPmeBsplineModuli[1];
+        delete gpu->psPmeBsplineModuli[2];
+        delete gpu->psPmeBsplineTheta;
+        delete gpu->psPmeBsplineDtheta;
+        delete gpu->psPmeAtomRange;
+        delete gpu->psPmeAtomGridIndex;
+        cufftDestroy(gpu->fftplan);
+    }
+    if (gpu->psTabulatedErfc != NULL)
+        delete gpu->psTabulatedErfc;
+    delete gpu->psObcData;
+    delete gpu->psGBVIData;
+    delete gpu->psGBVISwitchDerivative;
+    delete gpu->psObcChain;
+    delete gpu->psBornForce;
+    delete gpu->psBornRadii;
+    delete gpu->psBornSum;
+    delete gpu->psBondID;
+    delete gpu->psBondParameter;
+    delete gpu->psBondAngleID1;
+    delete gpu->psBondAngleID2;
+    delete gpu->psBondAngleParameter;
+    delete gpu->psDihedralID1;
+    delete gpu->psDihedralID2;
+    delete gpu->psDihedralParameter;
+    delete gpu->psRbDihedralID1;
+    delete gpu->psRbDihedralID2;
+    delete gpu->psRbDihedralParameter1;
+    delete gpu->psRbDihedralParameter2;
+    delete gpu->psLJ14ID;
+    delete gpu->psLJ14Parameter;
+    delete gpu->psShakeID;
+    delete gpu->psShakeParameter;
+    delete gpu->psSettleID;
+    delete gpu->psSettleParameter;
+    delete gpu->psExclusion;
+    delete gpu->psExclusionIndex;
+    delete gpu->psWorkUnit;
+    delete gpu->psInteractingWorkUnit;
+    delete gpu->psInteractionFlag;
+    delete gpu->psInteractionCount;
+    delete gpu->psStepSize;
+    delete gpu->psLangevinParameters;
+    delete gpu->psRandom4;
+    delete gpu->psRandom2;
+    delete gpu->psRandomPosition;    
+    delete gpu->psRandomSeed;
+    delete gpu->psLinearMomentum;
+    delete gpu->psAtomIndex;
+    delete gpu->psGridBoundingBox;
+    delete gpu->psGridCenter;
+    delete gpu->psCcmaAtoms;
+    delete gpu->psCcmaDistance;
+    delete gpu->psCcmaAtomConstraints;
+    delete gpu->psCcmaNumAtomConstraints;
+    delete gpu->psCcmaDelta1;
+    delete gpu->psCcmaDelta2;
+    delete gpu->psCcmaReducedMass;
+    cudaEventDestroy(gpu->ccmaEvent);
+    delete gpu->psConstraintMatrixColumn;
+    delete gpu->psConstraintMatrixValue;
+    delete gpu->psTabulatedFunctionParams;
+    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++)
+        if (gpu->tabulatedFunctions[i].coefficients != NULL)
+            delete gpu->tabulatedFunctions[i].coefficients;
+    if (gpu->compactPlan.valid)
+        destroyCompactionPlan(gpu->compactPlan);
+    // Wrap up
+    delete gpu;
+    cudaThreadExit();
+    return;
+}
+extern "C"
+int gpuBuildOutputBuffers(gpuContext gpu)
+{
+    // Select the number of output buffer to use.
+    gpu->bOutputBufferPerWarp           = true;
+    gpu->sim.nonbondOutputBuffers       = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block / GRID;
+    if (gpu->sim.nonbondOutputBuffers >= gpu->sim.paddedNumberOfAtoms/GRID)
+    {
+        // For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
+        gpu->bOutputBufferPerWarp           = false;
+        gpu->sim.nonbondOutputBuffers       = gpu->sim.paddedNumberOfAtoms / GRID;
+    }
+    if (gpu->sim.nonbondOutputBuffers > gpu->sim.outputBuffers)
+        gpu->sim.outputBuffers = gpu->sim.nonbondOutputBuffers;
+    unsigned int outputBuffers = gpu->sim.outputBuffers;
+    for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        if (outputBuffers < gpu->pOutputBufferCounter[i])
+        {
+            outputBuffers = gpu->pOutputBufferCounter[i];
+        }
+    }    
+    gpu->sim.outputBuffers      = outputBuffers;
+    gpu->sim.energyOutputBuffers = max(gpu->sim.nonbond_threads_per_block, gpu->sim.localForces_threads_per_block)*gpu->sim.blocks;
+    gpu->psForce4               = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, outputBuffers, "Force");
+    gpu->psEnergy               = new CUDAStream<float>(gpu->sim.energyOutputBuffers, 1, "Energy");
+    gpu->psBornForce            = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
+    gpu->psBornSum              = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
+    gpu->sim.pForce4            = gpu->psForce4->_pDevStream[0];
+    gpu->sim.pEnergy            = gpu->psEnergy->_pDevStream[0];
+    gpu->sim.pBornForce         = gpu->psBornForce->_pDevStream[0];
+    gpu->sim.pBornSum           = gpu->psBornSum->_pDevStream[0];
+    // Determine local energy paramter offsets for bonded interactions
+    gpu->sim.bond_offset        =                                  gpu->psBondParameter->_stride;
+    gpu->sim.bond_angle_offset  = gpu->sim.bond_offset           + gpu->psBondAngleParameter->_stride;
+    gpu->sim.dihedral_offset    = gpu->sim.bond_angle_offset     + gpu->psDihedralParameter->_stride;
+    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset       + gpu->psRbDihedralParameter1->_stride;
+    gpu->sim.LJ14_offset        = gpu->sim.rb_dihedral_offset    + gpu->psLJ14Parameter->_stride;
+    gpu->sim.localForces_threads_per_block  = (max(gpu->sim.LJ14_offset, gpu->sim.customBonds) / gpu->sim.blocks + 15) & 0xfffffff0;
+    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
+        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
+    if (gpu->sim.localForces_threads_per_block < 1)
+        gpu->sim.localForces_threads_per_block = 1;
+    // Flip local force output buffers
+    int flip = outputBuffers - 1;
+    for (int i = 0; i < (int) gpu->sim.bonds; i++)
+    {
+        (*gpu->psBondID)[i].z = flip - (*gpu->psBondID)[i].z;
+        (*gpu->psBondID)[i].w = flip - (*gpu->psBondID)[i].w;
+    }
+    for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
+    {
+        (*gpu->psBondAngleID1)[i].w = flip - (*gpu->psBondAngleID1)[i].w;
+        (*gpu->psBondAngleID2)[i].x = flip - (*gpu->psBondAngleID2)[i].x;
+        (*gpu->psBondAngleID2)[i].y = flip - (*gpu->psBondAngleID2)[i].y;
+    }
+    for (int i = 0; i < (int) gpu->sim.dihedrals; i++)
+    {
+        (*gpu->psDihedralID2)[i].x = flip - (*gpu->psDihedralID2)[i].x;
+        (*gpu->psDihedralID2)[i].y = flip - (*gpu->psDihedralID2)[i].y;
+        (*gpu->psDihedralID2)[i].z = flip - (*gpu->psDihedralID2)[i].z;
+        (*gpu->psDihedralID2)[i].w = flip - (*gpu->psDihedralID2)[i].w;
+    }
+    for (int i = 0; i < (int) gpu->sim.rb_dihedrals; i++)
+    {
+        (*gpu->psRbDihedralID2)[i].x = flip - (*gpu->psRbDihedralID2)[i].x;
+        (*gpu->psRbDihedralID2)[i].y = flip - (*gpu->psRbDihedralID2)[i].y;
+        (*gpu->psRbDihedralID2)[i].z = flip - (*gpu->psRbDihedralID2)[i].z;
+        (*gpu->psRbDihedralID2)[i].w = flip - (*gpu->psRbDihedralID2)[i].w;
+    }
+    for (int i = 0; i < (int) gpu->sim.LJ14s; i++)
+    {
+        (*gpu->psLJ14ID)[i].z = flip - (*gpu->psLJ14ID)[i].z;
+        (*gpu->psLJ14ID)[i].w = flip - (*gpu->psLJ14ID)[i].w;
+    }
+    gpu->psBondID->Upload();
+    gpu->psBondAngleID1->Upload();
+    gpu->psBondAngleID2->Upload();
+    gpu->psDihedralID2->Upload();
+    gpu->psRbDihedralID2->Upload();
+    gpu->psLJ14ID->Upload();
+    return 1;
+}
+extern "C"
+int gpuBuildThreadBlockWorkList(gpuContext gpu)
+{
+    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
+    const unsigned int grid = gpu->grid;
+    const unsigned int dim = (atoms + (grid - 1)) / grid;
+    const unsigned int cells = dim * (dim + 1) / 2;
+    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "WorkUnit");
+    unsigned int* pWorkList = psWorkUnit->_pSysData;
+    gpu->psWorkUnit = psWorkUnit;
+    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
+    CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u, "InteractingWorkUnit");
+    gpu->psInteractingWorkUnit = psInteractingWorkUnit;
+    gpu->sim.pInteractingWorkUnit = psInteractingWorkUnit->_pDevStream[0];
+    CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u, "InteractionFlag");
+    gpu->psInteractionFlag = psInteractionFlag;
+    gpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
+    CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u, "InteractionCount");
+    gpu->psInteractionCount = psInteractionCount;
+    gpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
+    CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u, "GridBoundingBox");
+    gpu->psGridBoundingBox = psGridBoundingBox;
+    gpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
+    CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u, "GridCenter");
+    gpu->psGridCenter = psGridCenter;
+    gpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
+    gpu->sim.nonbond_workBlock      = gpu->sim.nonbond_threads_per_block / GRID;
+    gpu->sim.bornForce2_workBlock   = gpu->sim.bornForce2_threads_per_block / GRID;
+    gpu->sim.workUnits = cells;
+    // Initialize the plan for doing stream compaction.
+    planCompaction(gpu->compactPlan);
+    // Increase block count if necessary for extra large molecules that would
+    // otherwise overflow the SM workunit buffers
+//    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
+//    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
+//    {
+//        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
+//    }
+//    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
+//    {
+//        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
+//    }
+    gpu->sim.nbWorkUnitsPerBlock            = cells / gpu->sim.nonbond_blocks;
+    gpu->sim.nbWorkUnitsPerBlockRemainder   = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
+    gpu->sim.bf2WorkUnitsPerBlock           = cells / gpu->sim.bornForce2_blocks;
+    gpu->sim.bf2WorkUnitsPerBlockRemainder  = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;
+    gpu->sim.interaction_threads_per_block = 64;
+    gpu->sim.interaction_blocks = (gpu->sim.workUnits + gpu->sim.interaction_threads_per_block - 1) / gpu->sim.interaction_threads_per_block;
+    if (gpu->sim.interaction_blocks > 8*gpu->sim.blocks)
+        gpu->sim.interaction_blocks = 8*gpu->sim.blocks;
+    // Decrease thread count for extra small molecules to spread computation
+    // across entire chip
+    int activeWorkUnits = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
+    if (activeWorkUnits > (int) cells)
+    {
+        int balancedWorkBlock                   = (cells + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
+        gpu->sim.nonbond_threads_per_block      = balancedWorkBlock * GRID;
+        gpu->sim.nonbond_workBlock              = balancedWorkBlock;
+    }
+    activeWorkUnits = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
+    if (activeWorkUnits > (int) cells)
+    {
+        int balancedWorkBlock                   = (cells + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
+        gpu->sim.bornForce2_threads_per_block   = balancedWorkBlock * GRID;
+        gpu->sim.bornForce2_workBlock           = balancedWorkBlock;
+    }
+    unsigned int count = 0;
+    for (unsigned int y = 0; y < dim; y++)
+    {
+        for (unsigned int x = y; x < dim; x++)
+        {
+            pWorkList[count] = (x << 17) | (y << 2);
+            count++;
+        }
+    }
+    (*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
+    gpu->psInteractionCount->Upload();
+    psWorkUnit->Upload();
+    gpuSetConstants(gpu);
+    return cells;
+}
+extern "C"
+void gpuBuildExclusionList(gpuContext gpu)
+{
+    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
+    const unsigned int grid = gpu->grid;
+    const unsigned int dim = atoms/grid;
+    unsigned int* pWorkList = gpu->psWorkUnit->_pSysData;
+    // Mark which work units have exclusions.
+    for (int atom1 = 0; atom1 < (int)gpu->exclusions.size(); ++atom1)
+    {
+        int x = atom1/grid;
+        for (int j = 0; j < (int)gpu->exclusions[atom1].size(); ++j)
+        {
+            int atom2 = gpu->exclusions[atom1][j];
+            int y = atom2/grid;
+            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
+            pWorkList[cell] |= 1;
+        }
+    }
+    if ((int)gpu->sim.paddedNumberOfAtoms > gpu->natoms)
+    {
+        int lastBlock = gpu->natoms/grid;
+        for (int i = 0; i < (int)gpu->sim.workUnits; ++i)
+        {
+            int x = pWorkList[i]>>17;
+            int y = (pWorkList[i]>>2)&0x7FFF;
+            if (x == lastBlock || y == lastBlock)
+                pWorkList[i] |= 1;
+        }
+    }
+    // Build a list of indexes for the work units with exclusions.
+    CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u, "ExclusionIndex");
+    gpu->psExclusionIndex = psExclusionIndex;
+    unsigned int* pExclusionIndex = psExclusionIndex->_pSysData;
+    gpu->sim.pExclusionIndex = psExclusionIndex->_pDevData;
+    int numWithExclusions = 0;
+    for (int i = 0; i < (int)psExclusionIndex->_length; ++i)
+        if ((pWorkList[i]&1) == 1)
+            pExclusionIndex[i] = (numWithExclusions++)*grid;
+    // Record the exclusion data.
+    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u, "Exclusion");
+    gpu->psExclusion = psExclusion;
+    unsigned int* pExclusion = psExclusion->_pSysData;
+    gpu->sim.pExclusion = psExclusion->_pDevData;
+    for (int i = 0; i < (int)psExclusion->_length; ++i)
+        pExclusion[i] = 0xFFFFFFFF;
+    for (int atom1 = 0; atom1 < (int)gpu->exclusions.size(); ++atom1)
+    {
+        int x = atom1/grid;
+        int offset1 = atom1-x*grid;
+        for (int j = 0; j < (int)gpu->exclusions[atom1].size(); ++j)
+        {
+            int atom2 = gpu->exclusions[atom1][j];
+            int y = atom2/grid;
+            int offset2 = atom2-y*grid;
+            if (x > y)
+            {
+                int cell = x+y*dim-y*(y+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
+            }
+            else
+            {
+                int cell = y+x*dim-x*(x+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
+            }
+        }
+    }
+    // Mark all interactions that involve a padding atom as being excluded.
+    for (int atom1 = gpu->natoms; atom1 < (int)atoms; ++atom1)
+    {
+        int x = atom1/grid;
+        int offset1 = atom1-x*grid;
+        for (int atom2 = 0; atom2 < (int)atoms; ++atom2)
+        {
+            int y = atom2/grid;
+            int offset2 = atom2-y*grid;
+            if (x >= y)
+            {
+                int cell = x+y*dim-y*(y+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
+            }
+            if (y >= x)
+            {
+                int cell = y+x*dim-x*(x+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
+            }
+        }
+    }
+    psExclusion->Upload();
+    psExclusionIndex->Upload();
+    gpu->psWorkUnit->Upload();
+    gpuSetConstants(gpu);
+}
+extern "C"
+int gpuSetConstants(gpuContext gpu)
+{
+    SetCalculateCDLJForcesSim(gpu);
+    SetCalculateCDLJObcGbsaForces1Sim(gpu);
+    SetCalculateCustomNonbondedForcesSim(gpu);
+    SetCalculateCustomBondForcesSim(gpu);
+    SetCalculateCustomAngleForcesSim(gpu);
+    SetCalculateCustomTorsionForcesSim(gpu);
+    SetCalculateCustomExternalForcesSim(gpu);
+    SetCalculateLocalForcesSim(gpu);
+    SetCalculateObcGbsaBornSumSim(gpu);
+    SetCalculateGBVIBornSumSim(gpu);
+    SetCalculateObcGbsaForces2Sim(gpu);
+    SetCalculateGBVIForces2Sim(gpu);
+    SetCalculateAndersenThermostatSim(gpu);
+    SetCalculatePMESim(gpu);
+    SetForcesSim(gpu);
+    SetShakeHSim(gpu);
+    SetLangevinUpdateSim(gpu);
+    SetVerletUpdateSim(gpu);
+    SetBrownianUpdateSim(gpu);
+    SetSettleSim(gpu);
+    SetCCMASim(gpu);
+    SetRandomSim(gpu);
+    return 1;
+}
+static void tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds)
+{
+    // Recursively tag atoms as belonging to a particular molecule.
+    atomMolecule[atom] = molecule;
+    for (int i = 0; i < (int)atomBonds[atom].size(); i++)
+        if (atomMolecule[atomBonds[atom][i]] == -1)
+            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
+}
+static void findMoleculeGroups(gpuContext gpu)
+{
+    // First make a list of constraints for future use.
+    vector<Constraint> constraints;
+    for (int i = 0; i < (int)gpu->sim.ShakeConstraints; i++)
+    {
+        int atom1 = (*gpu->psShakeID)[i].x;
+        int atom2 = (*gpu->psShakeID)[i].y;
+        int atom3 = (*gpu->psShakeID)[i].z;
+        int atom4 = (*gpu->psShakeID)[i].w;
+        float distance2 = (*gpu->psShakeParameter)[i].z;
+        constraints.push_back(Constraint(atom1, atom2, distance2));
+        if (atom3 != -1)
+            constraints.push_back(Constraint(atom1, atom3, distance2));
+        if (atom4 != -1)
+            constraints.push_back(Constraint(atom1, atom4, distance2));
+    }
+    for (int i = 0; i < (int)gpu->sim.settleConstraints; i++)
+    {
+        int atom1 = (*gpu->psSettleID)[i].x;
+        int atom2 = (*gpu->psSettleID)[i].y;
+        int atom3 = (*gpu->psSettleID)[i].z;
+        float distance12 = (*gpu->psSettleParameter)[i].x;
+        float distance23 = (*gpu->psSettleParameter)[i].y;
+        constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
+        constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
+        constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
+    }
+    for (int i = 0; i < (int)gpu->sim.ccmaConstraints; i++)
+    {
+        int atom1 = (*gpu->psCcmaAtoms)[i].x;
+        int atom2 = (*gpu->psCcmaAtoms)[i].y;
+        float distance2 = (*gpu->psCcmaDistance)[i].w;
+        constraints.push_back(Constraint(atom1, atom2, distance2));
+    }
+    // First make a list of every other atom to which each atom is connect by a bond, constraint, or exclusion.
+    int numAtoms = gpu->natoms;
+    vector<vector<int> > atomBonds(numAtoms);
+    for (int i = 0; i < (int) gpu->forces.size(); i++) {
+        for (int j = 0; j < gpu->forces[i]->getNumParticleGroups(); j++) {
+            vector<int> particles;
+            gpu->forces[i]->getParticlesInGroup(j, particles);
+            for (int k = 0; k < (int) particles.size(); k++)
+                for (int m = 0; m < (int) particles.size(); m++)
+                    if (k != m)
+                        atomBonds[particles[k]].push_back(particles[m]);
+        }
+    }
+    for (int i = 0; i < (int)constraints.size(); i++)
+    {
+        int atom1 = constraints[i].atom1;
+        int atom2 = constraints[i].atom2;
+        atomBonds[atom1].push_back(atom2);
+        atomBonds[atom2].push_back(atom1);
+    }
+    // Now tag atoms by which molecule they belong to.
+    vector<int> atomMolecule(numAtoms, -1);
+    int numMolecules = 0;
+    for (int i = 0; i < numAtoms; i++)
+        if (atomMolecule[i] == -1)
+            tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
+    vector<vector<int> > atomIndices(numMolecules);
+    for (int i = 0; i < numAtoms; i++)
+        atomIndices[atomMolecule[i]].push_back(i);
+    // Construct a description of each molecule.
+    vector<Molecule> molecules(numMolecules);
+    for (int i = 0; i < numMolecules; i++)
+    {
+        molecules[i].atoms = atomIndices[i];
+        molecules[i].groups.resize(gpu->forces.size());
+    }
+    for (int i = 0; i < (int) gpu->forces.size(); i++)
+        for (int j = 0; j < gpu->forces[i]->getNumParticleGroups(); j++)
+        {
+            vector<int> particles;
+            gpu->forces[i]->getParticlesInGroup(j, particles);
+            molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
+        }
+    for (int i = 0; i < (int)constraints.size(); i++)
+    {
+        molecules[atomMolecule[constraints[i].atom1]].constraints.push_back(i);
+    }
+    // Sort them into groups of identical molecules.
+    vector<Molecule> uniqueMolecules;
+    vector<vector<int> > moleculeInstances;
+    for (int molIndex = 0; molIndex < (int)molecules.size(); molIndex++)
+    {
+        Molecule& mol = molecules[molIndex];
+        // See if it is identical to another molecule.
+        bool isNew = true;
+        for (int j = 0; j < (int)uniqueMolecules.size() && isNew; j++)
+        {
+            Molecule& mol2 = uniqueMolecules[j];
+            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
+            // See if the atoms are identical.
+            int atomOffset = mol2.atoms[0]-mol.atoms[0];
+            float4* velm = gpu->psVelm4->_pSysData;
+            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
+                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || velm[mol.atoms[i]].w != velm[mol2.atoms[i]].w)
+                    identical = false;
+                for (int k = 0; k < (int) gpu->forces.size(); k++)
+                    if (!gpu->forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
+                        identical = false;
+            }
+            // See if the constraints are identical.
+            for (int i = 0; i < (int) mol.constraints.size() && identical; i++)
+                if (constraints[mol.constraints[i]].atom1 != constraints[mol2.constraints[i]].atom1-atomOffset ||
+                        constraints[mol.constraints[i]].atom2 != constraints[mol2.constraints[i]].atom2-atomOffset ||
+                        constraints[mol.constraints[i]].distance2 != constraints[mol2.constraints[i]].distance2)
+                    identical = false;
+            // See if the force groups are identical.
+            for (int i = 0; i < (int) gpu->forces.size() && identical; i++)
+            {
+                if (mol.groups[i].size() != mol2.groups[i].size())
+                    identical = false;
+                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
+                    if (!gpu->forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
+                        identical = false;
+            }
+            if (identical)
+            {
+                moleculeInstances[j].push_back(mol.atoms[0]);
+                isNew = false;
+            }
+        }
+        if (isNew)
+        {
+            uniqueMolecules.push_back(mol);
+            moleculeInstances.push_back(vector<int>());
+            moleculeInstances[moleculeInstances.size()-1].push_back(mol.atoms[0]);
+        }
+    }
+    gpu->moleculeGroups.resize(moleculeInstances.size());
+    for (int i = 0; i < (int)moleculeInstances.size(); i++)
+    {
+        gpu->moleculeGroups[i].instances = moleculeInstances[i];
+        vector<int>& atoms = uniqueMolecules[i].atoms;
+        gpu->moleculeGroups[i].atoms.resize(atoms.size());
+        for (int j = 0; j < (int)atoms.size(); j++)
+            gpu->moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
+    }
+}
+extern "C"
+void gpuReorderAtoms(gpuContext gpu)
+{
+    if (gpu->natoms == 0 || gpu->sim.nonbondedCutoffSqr == 0.0)
+        return;
+    if (gpu->moleculeGroups.size() == 0)
+        findMoleculeGroups(gpu);
+    // Find the range of positions and the number of bins along each axis.
+    int numAtoms = gpu->natoms;
+    gpu->psPosq4->Download();
+    gpu->psVelm4->Download();
+    float4* posq = gpu->psPosq4->_pSysData;
+    float4* velm = gpu->psVelm4->_pSysData;
+    float minx = posq[0].x, maxx = posq[0].x;
+    float miny = posq[0].y, maxy = posq[0].y;
+    float minz = posq[0].z, maxz = posq[0].z;
+    if (gpu->sim.nonbondedMethod == PERIODIC || gpu->sim.nonbondedMethod == EWALD || gpu->sim.nonbondedMethod == PARTICLE_MESH_EWALD)
+    {
+        minx = miny = minz = 0.0;
+        maxx = gpu->sim.periodicBoxSizeX;
+        maxy = gpu->sim.periodicBoxSizeY;
+        maxz = gpu->sim.periodicBoxSizeZ;
+    }
+    else
+    {
+        for (int i = 1; i < numAtoms; i++)
+        {
+            minx = min(minx, posq[i].x);
+            maxx = max(maxx, posq[i].x);
+            miny = min(miny, posq[i].y);
+            maxy = max(maxy, posq[i].y);
+            minz = min(minz, posq[i].z);
+            maxz = max(maxz, posq[i].z);
+        }
+    }
+    // Loop over each group of identical molecules and reorder them.
+    vector<int> originalIndex(numAtoms);
+    vector<float4> newPosq(numAtoms);
+    vector<float4> newVelm(numAtoms);
+    vector<int3> newCellOffsets(numAtoms);
+    for (int group = 0; group < (int)gpu->moleculeGroups.size(); group++)
+    {
+        // Find the center of each molecule.
+        gpuMoleculeGroup& mol = gpu->moleculeGroups[group];
+        int numMolecules = mol.instances.size();
+        vector<int>& atoms = mol.atoms;
+        vector<float3> molPos(numMolecules);
+        for (int i = 0; i < numMolecules; i++)
+        {
+            molPos[i].x = 0.0f;
+            molPos[i].y = 0.0f;
+            molPos[i].z = 0.0f;
+            for (int j = 0; j < (int)atoms.size(); j++)
+            {
+                int atom = atoms[j]+mol.instances[i];
+                molPos[i].x += posq[atom].x;
+                molPos[i].y += posq[atom].y;
+                molPos[i].z += posq[atom].z;
+            }
+            molPos[i].x /= atoms.size();
+            molPos[i].y /= atoms.size();
+            molPos[i].z /= atoms.size();
+        }
+        if (gpu->sim.nonbondedMethod == PERIODIC || gpu->sim.nonbondedMethod == EWALD || gpu->sim.nonbondedMethod == PARTICLE_MESH_EWALD)
+        {
+            // Move each molecule position into the same box.
+            for (int i = 0; i < numMolecules; i++)
+            {
+                int xcell = (int) floor(molPos[i].x/gpu->sim.periodicBoxSizeX);
+                int ycell = (int) floor(molPos[i].y/gpu->sim.periodicBoxSizeY);
+                int zcell = (int) floor(molPos[i].z/gpu->sim.periodicBoxSizeZ);
+                float dx = xcell*gpu->sim.periodicBoxSizeX;
+                float dy = ycell*gpu->sim.periodicBoxSizeY;
+                float dz = zcell*gpu->sim.periodicBoxSizeZ;
+                if (dx != 0.0f || dy != 0.0f || dz != 0.0f)
+                {
+                    molPos[i].x -= dx;
+                    molPos[i].y -= dy;
+                    molPos[i].z -= dz;
+                    for (int j = 0; j < (int)atoms.size(); j++)
+                    {
+                        int atom = atoms[j]+mol.instances[i];
+                        posq[atom].x -= dx;
+                        posq[atom].y -= dy;
+                        posq[atom].z -= dz;
+                        gpu->posCellOffsets[atom].x -= xcell;
+                        gpu->posCellOffsets[atom].y -= ycell;
+                        gpu->posCellOffsets[atom].z -= zcell;
+                    }
+                }
+            }
+        }
+        // Select a bin for each molecule, then sort them by bin.
+        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
+        float binWidth;
+        if (useHilbert)
+            binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
+        else
+            binWidth = (float)(0.2*sqrt(gpu->sim.nonbondedCutoffSqr));
+        int xbins = 1 + (int) ((maxx-minx)/binWidth);
+        int ybins = 1 + (int) ((maxy-miny)/binWidth);
+        vector<pair<int, int> > molBins(numMolecules);
+        bitmask_t coords[3];
+        for (int i = 0; i < numMolecules; i++)
+        {
+            int x = (int) ((molPos[i].x-minx)/binWidth);
+            int y = (int) ((molPos[i].y-miny)/binWidth);
+            int z = (int) ((molPos[i].z-minz)/binWidth);
+            int bin;
+            if (useHilbert)
+            {
+                coords[0] = x;
+                coords[1] = y;
+                coords[2] = z;
+                bin = (int) hilbert_c2i(3, 8, coords);
+            }
+            else
+            {
+                int yodd = y&1;
+                int zodd = z&1;
+                bin = z*xbins*ybins;
+                bin += (zodd ? ybins-y : y)*xbins;
+                bin += (yodd ? xbins-x : x);
+            }
+            molBins[i] = pair<int, int>(bin, i);
+        }
+        sort(molBins.begin(), molBins.end());
+        // Reorder the atoms.
+        for (int i = 0; i < numMolecules; i++)
+        {
+            for (int j = 0; j < (int)atoms.size(); j++)
+            {
+                int oldIndex = mol.instances[molBins[i].second]+atoms[j];
+                int newIndex = mol.instances[i]+atoms[j];
+                originalIndex[newIndex] = (*gpu->psAtomIndex)[oldIndex];
+                newPosq[newIndex] = posq[oldIndex];
+                newVelm[newIndex] = velm[oldIndex];
+                newCellOffsets[newIndex] = gpu->posCellOffsets[oldIndex];
+            }
+        }
+    }
+    // Update the streams.
+    for (int i = 0; i < numAtoms; i++) {
+        posq[i] = newPosq[i];
+        velm[i] = newVelm[i];
+        (*gpu->psAtomIndex)[i] = originalIndex[i];
+        gpu->posCellOffsets[i] = newCellOffsets[i];
+    }
+    gpu->psPosq4->Upload();
+    gpu->psVelm4->Upload();
+    gpu->psAtomIndex->Upload();
+}
--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
-Vim: Warning: Output is not to a terminal
+#ifndef __GPUTYPES_H__
-[?1049h[?1h=[1;59r[?12;25h[?12l[?25h[27m[m[H[2J[?25l[59;1H"svn-commit.tmp" 15L, 601C[1;1H[33m  1 
+#define __GPUTYPES_H__
-  2 [m[32m--This line, and those below, will be ignored--[m
-[33m  3 
+/* -------------------------------------------------------------------------- *
-  4 [m[1m[35mM    plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp[m
+ *                                   OpenMM                                   *
-[33m  5 [m[1m[35mM    plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.cpp[m
+ * -------------------------------------------------------------------------- *
-[33m  6 [m[1m[35mM    openmmapi/include/openmm/GBVIForce.h[m
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
-[33m  7 [m[1m[35mM    openmmapi/src/GBVIForce.cpp[m
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
-[33m  8 [m[1m[35mM    olla/src/Platform.cpp[m
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
-[33m  9 [m[1m[35mM    platforms/opencl/src/OpenCLContext.h[m
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
-[33m 10 [m[1m[35mM    platforms/cuda/src/CudaKernels.cpp[m
+ *                                                                            *
-[33m 11 [m[1m[35mM    platforms/cuda/src/kernels/kCalculateGBVIBornSum.cu[m
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
-[33m 12 [m[1m[35mM    platforms/cuda/src/kernels/gputypes.h[m
+ * Authors: Scott Le Grand, Peter Eastman                                     *
-[33m 13 [m[1m[35mM    platforms/cuda/src/kernels/cudatypes.h[m
+ * Contributors:                                                              *
-[33m 14 [m[1m[35mM    platforms/cuda/src/kernels/kForces.cu[m
+ *                                                                            *
-[33m 15 [m[1m[35mM    platforms/cuda/src/kernels/gpu.cpp[m
+ * This program is free software: you can redistribute it and/or modify       *
-[1m[34m~                                                                                                                                                                                                                                         [17;1H~                                                                                                                                                                                                                                         [18;1H~                                                                                                                                                                                                                                         [19;1H~                                                                                                                                                                                                                                         [20;1H~                                                                                                                                                                                                                                         [21;1H~                                                                                                                                                                                                                                         [22;1H~                                                                                                                                                                                                                                         [23;1H~                                                                                                                                                                                                                                         [24;1H~                                                                                                                                                                                                                                         [25;1H~                                                                                                                                                                                                                                         [26;1H~                                                                                                                                                                                                                                         [27;1H~                                                                                                                                                                                                                                         [28;1H~                                                                                                                                                                                                                                         [29;1H~                                                                                                                                                                                                                                         [30;1H~                                                                                                                                                                                                                                         [31;1H~                                                                                                                                                                                                                                         [32;1H~                                                                                                                                                                                                                                         [33;1H~                                                                                                                                                                                                                                         [34;1H~                                                                                                                                                                                                                                         [35;1H~                                                                                                                                                                                                                                         [36;1H~                                                                                                                                                                                                                                         [37;1H~                                                                                                                                                                                                                                         [38;1H~                                                                                                                                                                                                                                         [39;1H~                                                                                                                                                                                                                                         [40;1H~                                                                                                                                                                                                                                         [41;1H~                                                                                                                                                                                                                                         [42;1H~                                                                                                                                                                                                                                         [43;1H~                                                                                                                                                                                                                                         [44;1H~                                                                                                                                                                                                                                         [45;1H~                                                                                                                                                                                                                                         [46;1H~                                                                                                                                                                                                                                         [47;1H~                                                                                                                                                                                                                                         [48;1H~                                                                                                                                                                                                                                         [49;1H~                                                                                                                                                                                                                                         [50;1H~                                                                                                                                                                                                                                         [51;1H~                                                                                                                                                                                                                                         [52;1H~                                                                                                                                                                                                                                         [53;1H~                                                                                                                                                                                                                                         [54;1H~                                                                                                                                                                                                                                         [55;1H~                                                                                                                                                                                                                                         [56;1H~                                                                                                                                                                                                                                         [57;1H~                                                                                                                                                                                                                                         [58;1H~                                                                                                                                                                                                                                         [m[59;217H1,0-1[9CAll[1;5H[?12l[?25h[?25l[59;1H[K[59;1H:[?12l[?25hq![?25l[59;1H[K[59;1H[?1l>[?12l[?25h[?1049l
+ * it under the terms of the GNU Lesser General Public License as published   *
-Log message unchanged or not specified
+ * by the Free Software Foundation, either version 3 of the License, or       *
-a)bort, c)ontinue, e)dit
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "cudatypes.h"
+#include "cudaCompact.h"
+#include <vector>
+#include "windowsExportCuda.h"
+namespace OpenMM {
+    class CudaForceInfo;
+}
+struct gpuAtomType {
+    std::string name;
+    char symbol;
+    float r;
+};
+struct gpuMoleculeGroup {
+    std::vector<int> atoms;
+    std::vector<int> instances;
+};
+struct gpuTabulatedFunction {
+    gpuTabulatedFunction() : coefficients(NULL) {
+    }
+    std::string name;
+    double min, max;
+    CUDAStream<float4>* coefficients;
+};
+enum SM_VERSION
+{
+    SM_10,
+    SM_11,
+    SM_12,
+    SM_20
+};
+/* Pointer to this structure will be given 
+ * to gromacs functions*/
+struct _gpuContext {
+    //Cache this here so that it doesn't
+    //have to be repeatedly passed around
+    int natoms;
+    int device;
+    bool useBlockingSync;
+    gpuAtomType* gpAtomTable;
+    int gAtomTypes;
+    unsigned int blocksPerSM;
+    unsigned int sharedMemoryPerBlock;
+    cudaGmxSimulation sim;
+    unsigned int* pOutputBufferCounter;
+    std::vector<OpenMM::CudaForceInfo*> forces;
+    std::vector<std::vector<int> > exclusions;
+    unsigned char* pAtomSymbol;
+    std::vector<gpuMoleculeGroup> moleculeGroups;
+    gpuTabulatedFunction tabulatedFunctions[MAX_TABULATED_FUNCTIONS];
+    std::vector<int3> posCellOffsets;
+    int iterations;
+    float epsfac;
+    float solventDielectric;
+    float soluteDielectric;
+    int grid;
+    bool bCalculateCM;
+    bool bRemoveCM;
+    bool bRecalculateBornRadii;
+    bool bOutputBufferPerWarp;
+    bool bIncludeGBSA;
+    bool bIncludeGBVI;
+    bool tabulatedFunctionsChanged;
+    unsigned long seed;
+    SM_VERSION sm_version;
+    compactionPlan compactPlan;
+    cufftHandle fftplan;
+    CUDAStream<float4>* psPosq4;
+    CUDAStream<float4>* psPosqP4;
+    CUDAStream<float4>* psOldPosq4;
+    CUDAStream<float4>* psVelm4;
+    CUDAStream<float4>* psForce4;
+    CUDAStream<float>*  psEnergy;           // Energy output buffer
+    CUDAStream<float2>* psSigEps2; 
+    CUDAStream<float4>* psCustomParams;     // Atom parameters for custom nonbonded force
+    CUDAStream<int4>* psCustomBondID;             // Atom indices for custom bonds
+    CUDAStream<float4>* psCustomBondParams;       // Parameters for custom bonds
+    CUDAStream<int4>* psCustomAngleID1;           // Atom indices for custom angles
+    CUDAStream<int2>* psCustomAngleID2;           // Atom indices for custom angles
+    CUDAStream<float4>* psCustomAngleParams;      // Parameters for custom angles
+    CUDAStream<int4>* psCustomTorsionID1;           // Atom indices for custom torsions
+    CUDAStream<int4>* psCustomTorsionID2;           // Atom indices for custom torsions
+    CUDAStream<float4>* psCustomTorsionParams;      // Parameters for custom torsions
+    CUDAStream<int>* psCustomExternalID;          // Atom indices for custom external force
+    CUDAStream<float4>* psCustomExternalParams;   // Parameters for custom external force
+    CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
+    CUDAStream<float2>* psEwaldCosSinSum;
+    CUDAStream<float>* psTabulatedErfc;     // Tabulated values for erfc()
+    CUDAStream<cufftComplex>* psPmeGrid;    // Grid points for particle mesh Ewald
+    CUDAStream<float>* psPmeBsplineModuli[3];
+    CUDAStream<float4>* psPmeBsplineTheta;
+    CUDAStream<float4>* psPmeBsplineDtheta;
+    CUDAStream<int>* psPmeAtomRange;           // The range of sorted atoms at each grid point
+    CUDAStream<int2>* psPmeAtomGridIndex;      // The grid point each atom is at
+    CUDAStream<float2>* psObcData;
+    CUDAStream<float4>* psGBVIData;
+    CUDAStream<float>* psGBVISwitchDerivative;
+    CUDAStream<float>* psObcChain;
+    CUDAStream<float>* psBornForce;
+    CUDAStream<float>* psBornRadii;
+    CUDAStream<float>* psBornSum;
+    CUDAStream<int4>* psBondID;
+    CUDAStream<float2>* psBondParameter;
+    CUDAStream<int4>* psBondAngleID1;
+    CUDAStream<int2>* psBondAngleID2;
+    CUDAStream<float2>* psBondAngleParameter;
+    CUDAStream<int4>* psDihedralID1;
+    CUDAStream<int4>* psDihedralID2;
+    CUDAStream<float4>* psDihedralParameter;
+    CUDAStream<int4>* psRbDihedralID1;
+    CUDAStream<int4>* psRbDihedralID2;
+    CUDAStream<float4>* psRbDihedralParameter1;
+    CUDAStream<float2>* psRbDihedralParameter2;
+    CUDAStream<int4>* psLJ14ID;
+    CUDAStream<float4>* psLJ14Parameter;
+    CUDAStream<int4>* psShakeID;
+    CUDAStream<float4>* psShakeParameter;
+    CUDAStream<int4>* psSettleID;
+    CUDAStream<float2>* psSettleParameter;
+    CUDAStream<unsigned int>* psExclusion;
+    CUDAStream<unsigned int>* psExclusionIndex;
+    CUDAStream<unsigned int>* psWorkUnit;
+    CUDAStream<unsigned int>* psInteractingWorkUnit;
+    CUDAStream<unsigned int>* psInteractionFlag;
+    CUDAStream<size_t>* psInteractionCount;
+    CUDAStream<float2>* psStepSize;         // The size of the previous and current time steps
+    CUDAStream<float>* psLangevinParameters;// Parameters used for Langevin integration
+    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
+    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
+    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
+    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
+    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
+    CUDAStream<int>* psAtomIndex;           // The original index of each atom
+    CUDAStream<float4>* psGridBoundingBox;  // The size of each grid cell
+    CUDAStream<float4>* psGridCenter;       // The center and radius for each grid cell
+    CUDAStream<int2>* psCcmaAtoms;          // The atoms connected by each CCMA constraint
+    CUDAStream<float4>* psCcmaDistance;     // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
+    CUDAStream<int>* psCcmaAtomConstraints; // The indices of constraints involving each atom
+    CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
+    CUDAStream<float>* psCcmaDelta1;        // Workspace for CCMA
+    CUDAStream<float>* psCcmaDelta2;        // Workspace for CCMA
+    int* ccmaConvergedHostMarker;           // Host memory used to communicate that CCMA has converged
+    cudaEvent_t ccmaEvent;                  // Used to optimize communication during CCMA
+    CUDAStream<float>* psCcmaReducedMass;   // The reduced mass for each CCMA constraint
+    CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
+    CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
+    CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
+    CUDAStream<unsigned int>* psConstraintMatrixColumn; // The column of each element in the constraint matrix.
+    CUDAStream<float>* psConstraintMatrixValue; // The value of each element in the constraint matrix.
+};
+typedef struct _gpuContext *gpuContext;
+// Function prototypes
+extern "C"
+bool gpuIsAvailable();
+extern "C"
+void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
+extern "C"
+void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
+        const std::vector<float>& angle, const std::vector<float>& k);
+extern "C"
+void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
+        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
+extern "C"
+void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
+        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
+extern "C"
+void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
+        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
+extern "C"
+void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
+        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
+extern "C"
+void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
+extern "C"
+void gpuSetTabulatedFunction(gpuContext gpu, int index, const std::string& name, const std::vector<double>& values, double min, double max);
+extern "C"
+void gpuSetCustomBondParameters(gpuContext gpu, const std::vector<int>& bondAtom1, const std::vector<int>& bondAtom2, const std::vector<std::vector<double> >& bondParams,
+            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
+extern "C"
+void gpuSetCustomAngleParameters(gpuContext gpu, const std::vector<int>& angleAtom1, const std::vector<int>& angleAtom2, const std::vector<int>& angleAtom3, const std::vector<std::vector<double> >& angleParams,
+            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
+extern "C"
+void gpuSetCustomTorsionParameters(gpuContext gpu, const std::vector<int>& torsionAtom1, const std::vector<int>& torsionAtom2, const std::vector<int>& torsionAtom3, const std::vector<int>& torsionAtom4, const std::vector<std::vector<double> >& torsionParams,
+            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
+extern "C"
+void gpuSetCustomExternalParameters(gpuContext gpu, const std::vector<int>& atomIndex, const std::vector<std::vector<double> >& atomParams,
+            const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
+extern "C"
+void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vector<double> >& parameters, const std::vector<std::vector<int> >& exclusions,
+            CudaNonbondedMethod method, float cutoffDistance, const std::string& energyExp,
+            const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
+extern "C"
+void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz);
+extern "C"
+void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ);
+extern "C"
+void OPENMMCUDA_EXPORT gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);
+extern "C"
+void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge);
+extern "C" 
+void gpuSetGBVIParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius,
+                          const std::vector<float>& gammas, const std::vector<float>& scaledRadii,
+                          int bornRadiusScalingMethod, float quinticLowerLimitFactor, float quinticUpperBornRadiusLimit);
+extern "C"
+void gpuSetConstraintParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
+        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float constraintTolerance);
+extern "C"
+int gpuAllocateInitialBuffers(gpuContext gpu);
+extern "C"
+void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
+extern "C"
+void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
+extern "C"
+void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
+extern "C"
+void OPENMMCUDA_EXPORT gpuInitializeRandoms(gpuContext gpu);
+extern "C"
+OPENMMCUDA_EXPORT void* gpuInit(int numAtoms, unsigned int device = 0, bool useBlockingSync = false);
+extern "C"
+void gpuSetLangevinIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature, float errorTol);
+extern "C"
+void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol);
+extern "C"
+void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
+extern "C"
+void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionFrequency);
+extern "C"
+void gpuShutDown(gpuContext gpu);
+extern "C"
+int gpuBuildOutputBuffers(gpuContext gpu);
+extern "C"
+int gpuBuildThreadBlockWorkList(gpuContext gpu);
+extern "C"
+void gpuBuildExclusionList(gpuContext gpu);
+extern "C"
+int OPENMMCUDA_EXPORT gpuSetConstants(gpuContext gpu);
+extern "C"
+void gpuReorderAtoms(gpuContext gpu);
+extern "C"
+void setExclusions(gpuContext gpu, const std::vector<std::vector<int> >& exclusions);
+#endif //__GPUTYPES_H__
--- a/platforms/cuda/src/kernels/kCalculateGBVIBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateGBVIBornSum.cu
-Vim: Warning: Output is not to a terminal
+/* -------------------------------------------------------------------------- *
-[?1049h[?1h=[1;59r[?12;25h[?12l[?25h[27m[m[H[2J[?25l[59;1H"svn-commit.tmp" 15L, 601C[1;1H[33m  1 
+ *                                   OpenMM                                   *
-  2 [m[32m--This line, and those below, will be ignored--[m
+ * -------------------------------------------------------------------------- *
-[33m  3 
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
-  4 [m[1m[35mM    plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp[m
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
-[33m  5 [m[1m[35mM    plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.cpp[m
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
-[33m  6 [m[1m[35mM    openmmapi/include/openmm/GBVIForce.h[m
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
-[33m  7 [m[1m[35mM    openmmapi/src/GBVIForce.cpp[m
+ *                                                                            *
-[33m  8 [m[1m[35mM    olla/src/Platform.cpp[m
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
-[33m  9 [m[1m[35mM    platforms/opencl/src/OpenCLContext.h[m
+ * Authors: Scott Le Grand, Peter Eastman                                     *
-[33m 10 [m[1m[35mM    platforms/cuda/src/CudaKernels.cpp[m
+ * Contributors:                                                              *
-[33m 11 [m[1m[35mM    platforms/cuda/src/kernels/kCalculateGBVIBornSum.cu[m
+ *                                                                            *
-[33m 12 [m[1m[35mM    platforms/cuda/src/kernels/gputypes.h[m
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
-[33m 13 [m[1m[35mM    platforms/cuda/src/kernels/cudatypes.h[m
+ * copy of this software and associated documentation files (the "Software"), *
-[33m 14 [m[1m[35mM    platforms/cuda/src/kernels/kForces.cu[m
+ * to deal in the Software without restriction, including without limitation  *
-[33m 15 [m[1m[35mM    platforms/cuda/src/kernels/gpu.cpp[m
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
-[1m[34m~                                                                                                                                                                                                                                         [17;1H~                                                                                                                                                                                                                                         [18;1H~                                                                                                                                                                                                                                         [19;1H~                                                                                                                                                                                                                                         [20;1H~                                                                                                                                                                                                                                         [21;1H~                                                                                                                                                                                                                                         [22;1H~                                                                                                                                                                                                                                         [23;1H~                                                                                                                                                                                                                                         [24;1H~                                                                                                                                                                                                                                         [25;1H~                                                                                                                                                                                                                                         [26;1H~                                                                                                                                                                                                                                         [27;1H~                                                                                                                                                                                                                                         [28;1H~                                                                                                                                                                                                                                         [29;1H~                                                                                                                                                                                                                                         [30;1H~                                                                                                                                                                                                                                         [31;1H~                                                                                                                                                                                                                                         [32;1H~                                                                                                                                                                                                                                         [33;1H~                                                                                                                                                                                                                                         [34;1H~                                                                                                                                                                                                                                         [35;1H~                                                                                                                                                                                                                                         [36;1H~                                                                                                                                                                                                                                         [37;1H~                                                                                                                                                                                                                                         [38;1H~                                                                                                                                                                                                                                         [39;1H~                                                                                                                                                                                                                                         [40;1H~                                                                                                                                                                                                                                         [41;1H~                                                                                                                                                                                                                                         [42;1H~                                                                                                                                                                                                                                         [43;1H~                                                                                                                                                                                                                                         [44;1H~                                                                                                                                                                                                                                         [45;1H~                                                                                                                                                                                                                                         [46;1H~                                                                                                                                                                                                                                         [47;1H~                                                                                                                                                                                                                                         [48;1H~                                                                                                                                                                                                                                         [49;1H~                                                                                                                                                                                                                                         [50;1H~                                                                                                                                                                                                                                         [51;1H~                                                                                                                                                                                                                                         [52;1H~                                                                                                                                                                                                                                         [53;1H~                                                                                                                                                                                                                                         [54;1H~                                                                                                                                                                                                                                         [55;1H~                                                                                                                                                                                                                                         [56;1H~                                                                                                                                                                                                                                         [57;1H~                                                                                                                                                                                                                                         [58;1H~                                                                                                                                                                                                                                         [m[59;217H1,0-1[9CAll[1;5H[?12l[?25h[?25l[59;1H[K[59;1H:[?12l[?25hq![?25l[59;1H[K[59;1H[?1l>[?12l[?25h[?1049l
+ * and/or sell copies of the Software, and to permit persons to whom the      *
-Log message unchanged or not specified
+ * Software is furnished to do so, subject to the following conditions:       *
-a)bort, c)ontinue, e)dit
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+#include "gputypes.h"
+#define UNROLLXX 0
+#define UNROLLXY 0
+struct Atom {
+    float x;
+    float y;
+    float z;
+    float r;
+    float sr;
+    float sum;
+    float gamma;
+};
+static __constant__ cudaGmxSimulation cSim;
+void SetCalculateGBVIBornSumSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+void GetCalculateGBVIBornSumSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+// Include versions of the kernels for N^2 calculations.
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateGBVIBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateGBVIBornSum.h"
+// Include versions of the kernels with cutoffs.
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateGBVIBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateGBVIBornSum.h"
+// Include versions of the kernels with periodic boundary conditions.
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateGBVIBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateGBVIBornSum.h"
+__global__ void kReduceGBVIBornSum_kernel()
+{
+    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
+    while (pos < cSim.atoms)
+    {
+        float sum = 0.0f;
+        float* pSt = cSim.pBornSum + pos;
+        float4 atom = cSim.pGBVIData[pos];
+        // Get summed Born data
+        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
+        {
+            sum += *pSt;
+       //     printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
+            pSt += cSim.stride;
+        }
+        // Now calculate Born radius
+        float Rinv           = 1.0f/atom.x;
+        sum                  = Rinv*Rinv*Rinv - sum; 
+        cSim.pBornRadii[pos] = pow( sum, (-1.0f/3.0f) ); 
+        pos += gridDim.x * blockDim.x;
+    }   
+}
+void kReduceGBVIBornSum(gpuContext gpu)
+{
+    //printf("kReduceGBVIBornSum\n");
+#define GBVI_DEBUG 0
+#if ( GBVI_DEBUG == 1 )
+               gpu->psGBVIData->Download();
+               gpu->psBornSum->Download();
+               gpu->psPosq4->Download();
+                (void) fprintf( stderr, "\nkReduceGBVIBornSum: Post BornSum %s Born radii & params\n", 
+                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+                   (void) fprintf( stderr, "%d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
+                                   ii, 
+                                   gpu->psBornSum->_pSysStream[0][ii],
+                                   gpu->psGBVIData->_pSysStream[0][ii].x,
+                                   gpu->psGBVIData->_pSysStream[0][ii].y,
+                                   gpu->psGBVIData->_pSysStream[0][ii].z,
+                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
+                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
+                                 );  
+                }   
+#endif
+#undef GBVI_DEBUG
+    kReduceGBVIBornSum_kernel<<<gpu->sim.blocks, 384>>>();
+    gpu->bRecalculateBornRadii = false;
+    LAUNCHERROR("kReduceGBVIBornSum");
+}
+void kCalculateGBVIBornSum(gpuContext gpu)
+{
+    //printf("kCalculateGBVIBornSum\n");
+    //size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+#define GBVI 0
+#if GBVI == 1
+int maxPrint = 10;
+gpu->psWorkUnit->Download();
+fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
+                 gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
+        sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
+               gpu->psGBVIData->Download();
+               gpu->psBornSum->Download();
+               gpu->psPosq4->Download();
+                (void) fprintf( stderr, "\nkCalculateGBVIBornSum: pre BornSum %s Born radii & params\n",
+                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+                   (void) fprintf( stderr, "%d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
+                                   ii, 
+                                   gpu->psBornSum->_pSysStream[0][ii],
+                                   gpu->psGBVIData->_pSysStream[0][ii].x,
+                                   gpu->psGBVIData->_pSysStream[0][ii].y,
+                                   gpu->psGBVIData->_pSysStream[0][ii].z,
+                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
+                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
+                                 );
+                   if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
+                      ii = gpu->natoms - maxPrint;
+                   }
+                }
+#endif
+#undef GBVI
+            if (gpu->bOutputBufferPerWarp){
+                kCalculateGBVIN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            } else {
+                kCalculateGBVIN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            }
+            break;
+        case CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+            break;
+        case PERIODIC:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+            else
+                kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+            break;
+    }
+    LAUNCHERROR("kCalculateGBVIBornSum");
+}