Deleted lots of obsolete code.

ff0a69f9 · Peter Eastman · a56ce87d · ff0a69f9 · ff0a69f9 · ff0a69f9
Commit ff0a69f9 authored Mar 24, 2009 by Peter Eastman
7 changed files
--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -39,10 +39,7 @@ extern void kGenerateRandoms(gpuContext gpu);
 // Main loop
 extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
-extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
 extern void kCalculateCDLJForces(gpuContext gpu);
-extern void kCalculateObcGbsaForces1(gpuContext gpu);
-extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
 extern void kReduceObcGbsaBornForces(gpuContext gpu);
 extern void kCalculateObcGbsaForces2(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
@@ -72,10 +69,6 @@ extern void SetCalculateLocalForcesSim(gpuContext gpu);
 extern void GetCalculateLocalForcesSim(gpuContext gpu);
 extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
 extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
-extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
-extern void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
-extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
-extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
 extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void SetCalculateAndersenThermostatSim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -55,78 +55,6 @@ using namespace std;
 using OpenMM::OpenMMException;
-#ifdef WIN32
-  typedef unsigned __int64 u64;
-  typedef signed __int64 s64;
-#else
-  typedef uint64_t u64;
-  typedef int64_t s64;
-#endif
-typedef unsigned int u32;
-typedef float f32;
-typedef double f64;
-typedef char ascii;
-typedef char utf8;
-typedef unsigned char u8;
-typedef signed char s8;
-typedef unsigned short u16;
-typedef signed short s16;
-typedef struct
-{
-  u8 type[4];
-  f32 charge;
-  f32 radius;
-} FAH_ATOM;
-typedef struct
-{
-  u32 a; /* rule: a < b */
-  u32 b;
-} FAH_BOND;
-typedef struct
-{
-  f32 x;
-  f32 y;
-  f32 z;
-} FAH_XYZ;
-typedef struct
-{
-  u32    magic;
-  u32    version;
-  utf8   name[64];
-  s64    timestamp;
-  u64    iterations;
-  u32    frames;
-  u32    atom_count;
-  u32    bond_count;
-  /* v2 */
-  utf8   user_name[64];
-  utf8   user_team[16];
-  utf8   user_done[16];
-} FAH_INFO;
-typedef struct
-{
-  u32    magic;
-  u32    version;
-  s64    timestamp;
-  u64    iterations_done;
-  u32    frames_done;
-  f32    energy;
-  f32    temperature;
-} FAH_CURRENT;
-typedef struct
-{
-  FAH_INFO info;
-  FAH_CURRENT current;
-  FAH_ATOM * atoms;
-  FAH_BOND * bonds;
-  FAH_XYZ  * xyz;
-} PROTEIN;
 struct ShakeCluster {
    int centralID;
    int peripheralID[3];
@@ -196,43 +124,6 @@ static const float BOLTZ                    =    (RGAS / KILO);            // (k
 #define DeltaShake
-extern "C"
-int gpuReadBondParameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[512];
-        int bonds;
-        infile >> bonds;
-        infile.getline(buff, 512);
-        vector<int> atom1(bonds);
-        vector<int> atom2(bonds);
-        vector<float> length(bonds);
-        vector<float> k(bonds);
-        for (int i = 0; i < bonds; i++)
-        {
-            int junk;
-            infile >> 
-                junk >> 
-                atom1[i] >> 
-                atom2[i] >> 
-                length[i] >> 
-                k[i];
-        }
-        gpuSetBondParameters(gpu, atom1, atom2, length, k);
-        return bonds;
-    }
-    else
-    {
-        cout << "Error opening harmonic bond parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
 {
@@ -268,45 +159,6 @@ void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector
    psBondParameter->Upload();
 }
-extern "C"
-int gpuReadBondAngleParameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[512];
-        int bond_angles;
-        infile >> bond_angles;
-        infile.getline(buff, 512);
-        vector<int> atom1(bond_angles);
-        vector<int> atom2(bond_angles);
-        vector<int> atom3(bond_angles);
-        vector<float> angle(bond_angles);
-        vector<float> k(bond_angles);
-        for (int i = 0; i < bond_angles; i++)
-        {
-            int junk;
-            infile >> 
-                junk >> 
-                atom1[i] >> 
-                atom2[i] >> 
-                atom3[i] >> 
-                angle[i] >> 
-                k[i];
-        }
-        gpuSetBondAngleParameters(gpu, atom1, atom2, atom3, angle, k);
-        return bond_angles;
-    }
-    else
-    {
-        cout << "Error opening harmonic bond angle parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
        const vector<float>& angle, const vector<float>& k)
@@ -352,48 +204,6 @@ void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const v
    psBondAngleParameter->Upload();
 }
-extern "C"
-int gpuReadDihedralParameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[512];
-        int dihedrals;
-        infile >> dihedrals;
-        infile.getline(buff, 512);
-        vector<int> atom1(dihedrals);
-        vector<int> atom2(dihedrals);
-        vector<int> atom3(dihedrals);
-        vector<int> atom4(dihedrals);
-        vector<float> k(dihedrals);
-        vector<float> phase(dihedrals);
-        vector<int> periodicity(dihedrals);
-        for (int i = 0; i < dihedrals; i++)
-        {
-            int junk;
-            infile >> 
-                junk >> 
-                atom1[i] >> 
-                atom2[i] >> 
-                atom3[i] >>
-                atom4[i] >> 
-                k[i] >> 
-                phase[i] >>
-                periodicity[i];
-        }
-        gpuSetDihedralParameters(gpu, atom1, atom2, atom3, atom4, k, phase, periodicity);
-        return dihedrals;
-    }
-    else
-    {
-        cout << "Error opening dihedral parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
@@ -443,68 +253,6 @@ void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const ve
        psDihedralParameter->Upload();
 }
-extern "C"
-int gpuReadRbDihedralParameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[512];
-        int rb_dihedrals;
-        infile >> rb_dihedrals;
-        infile.getline(buff, 512);
-        vector<int> atom1(rb_dihedrals);
-        vector<int> atom2(rb_dihedrals);
-        vector<int> atom3(rb_dihedrals);
-        vector<int> atom4(rb_dihedrals);
-        vector<float> c0(rb_dihedrals);
-        vector<float> c1(rb_dihedrals);
-        vector<float> c2(rb_dihedrals);
-        vector<float> c3(rb_dihedrals);
-        vector<float> c4(rb_dihedrals);
-        vector<float> c5(rb_dihedrals);
-        gpu->sim.rb_dihedrals = rb_dihedrals;
-        CUDAStream<int4>* psRbDihedralID1           = new CUDAStream<int4>(rb_dihedrals, 1);
-        gpu->psRbDihedralID1                        = psRbDihedralID1;
-        gpu->sim.pRbDihedralID1                     = psRbDihedralID1->_pDevStream[0];
-        CUDAStream<int4>* psRbDihedralID2           = new CUDAStream<int4>(rb_dihedrals, 1);
-        gpu->psRbDihedralID2                        = psRbDihedralID2;
-        gpu->sim.pRbDihedralID2                     = psRbDihedralID2->_pDevStream[0];
-        CUDAStream<float4>* psRbDihedralParameter1  = new CUDAStream<float4>(rb_dihedrals, 1);
-        gpu->psRbDihedralParameter1                 = psRbDihedralParameter1;
-        gpu->sim.pRbDihedralParameter1              = psRbDihedralParameter1->_pDevStream[0];
-        CUDAStream<float2>* psRbDihedralParameter2  = new CUDAStream<float2>(rb_dihedrals, 1);    
-        gpu->psRbDihedralParameter2                 = psRbDihedralParameter2;
-        gpu->sim.pRbDihedralParameter2              = psRbDihedralParameter2->_pDevStream[0];
-        for (int i = 0; i < rb_dihedrals; i++)
-        {
-            int junk;
-            infile >> 
-                junk >> 
-                atom1[i] >> 
-                atom2[i] >> 
-                atom3[i] >>
-                atom4[i] >> 
-                c0[i] >> 
-                c1[i] >> 
-                c2[i] >> 
-                c3[i] >> 
-                c4[i] >> 
-                c5[i];
-        }
-        gpuSetRbDihedralParameters(gpu, atom1, atom2, atom3, atom4, c0, c1, c2, c3, c4, c5);
-        return rb_dihedrals;
-    }
-    else
-    {
-        cout << "Error opening Ryckaert-Bellemans dihedral parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
@@ -566,56 +314,6 @@ void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const
    psRbDihedralParameter2->Upload();
 }
-extern "C"
-int gpuReadLJ14Parameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[1024];
-        float epsfac = 0.0f;
-        float fudge = 0.0f;
-        int LJ14s;
-        infile >> LJ14s;
-        infile.get(buff, 61);
-        // cout << buff << endl;
-        infile >> epsfac;
-        infile.get(buff, 8);
-        infile >> fudge;
-        infile.getline(buff, 512);
-        // cout << buff << endl;
-        vector<int> atom1(LJ14s);
-        vector<int> atom2(LJ14s);
-        vector<float> c6(LJ14s);
-        vector<float> c12(LJ14s);
-        vector<float> q1(LJ14s);
-        vector<float> q2(LJ14s);
-        for (int i = 0; i < LJ14s; i++)
-        {
-            int junk;
-            infile >> 
-                junk >> 
-                atom1[i] >> 
-                atom2[i] >> 
-                c6[i] >> 
-                c12[i] >>
-                q1[i] >>
-                q2[i];
-        }
-        gpuSetLJ14Parameters(gpu, epsfac, fudge, atom1, atom2, c6, c12, q1, q2);
-        return LJ14s;
-    }
-    else
-    {
-        cout << "Error opening Lennard-Jones 1-4 parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
@@ -672,158 +370,6 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vecto
    psLJ14Parameter->Upload();
 }
-extern "C"
-float gpuGetAtomicRadius(gpuContext gpu, string s)
-{
-    for (int i = 0; i < gpu->gAtomTypes; i++)
-    {
-        if (s == gpu->gpAtomTable[i].name)
-        {
-            return gpu->gpAtomTable[i].r;
-        }
-    }
-    return 0.0f;
-}
-extern "C"
-unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s)
-{
-    for (int i = 0; i < gpu->gAtomTypes; i++)
-    {
-        if (s == gpu->gpAtomTable[i].name)
-        {
-            return gpu->gpAtomTable[i].symbol;
-        }
-    }
-    return ' ';
-}
-extern "C"
-int gpuReadAtomicParameters(gpuContext gpu, char* fname)
-{
-    gpu->gAtomTypes = 0;
-    if (gpu->gpAtomTable)
-        delete[] gpu->gpAtomTable;
-    // Read file once to count atom types
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[1024];
-        int skips = 0;
-        bool skipflag = true;
-        while (infile.getline(buff, 512))
-        {
-            if (buff[0] == ' ')
-            {
-                skipflag = false;
-                gpu->gAtomTypes++;
-            }
-            else if (skipflag)
-                skips++;
-        }
-        infile.close();
-        gpu->gpAtomTable = new gpuAtomType[gpu->gAtomTypes];
-        ifstream infile1(fname);
-        for (int i = 0; i < skips; i++)
-        {
-            infile1.getline(buff, 512);
-        }
-        for (int i = 0; i < gpu->gAtomTypes; i++)
-        {
-            infile1 >> gpu->gpAtomTable[i].name >> gpu->gpAtomTable[i].r;
-            infile1.getline(buff, 512);
-            // Determine symbol
-            if (gpu->gpAtomTable[i].r < 1.3f)
-                gpu->gpAtomTable[i].symbol = 'H';
-            else if (gpu->gpAtomTable[i].r < 1.6f)
-                gpu->gpAtomTable[i].symbol = 'O';
-            else if (gpu->gpAtomTable[i].r < 1.7f)
-                gpu->gpAtomTable[i].symbol = 'N';
-            else
-                gpu->gpAtomTable[i].symbol = 'C';
-#if (DUMP_PARAMETERS == 1)            
-            cout << i << " " << gpu->gpAtomTable[i].name << " " << gpu->gpAtomTable[i].symbol << " " << gpu->gpAtomTable[i].r << endl; 
-#endif
-        }
-        return gpu->gAtomTypes;
-    }
-    else
-    {
-        cout << "Error opening atom parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;   
-}
-extern "C"
-int gpuReadCoulombParameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[1024];
-        unsigned int coulombs;
-        float fudge = 0.0f;
-        float epsfac = 1.0f;
-        infile >> coulombs;
-        infile.get(buff, 9);
-        infile >> epsfac;
-        infile.get(buff, 8);
-        infile >> fudge;
-        infile.getline(buff, 512);
-        vector<int> atom(coulombs);
-        vector<float> c6(coulombs);
-        vector<float> c12(coulombs);
-        vector<float> q(coulombs);
-        vector<float> radius(coulombs);
-        vector<float> scale(coulombs);
-        vector<char> symbol(coulombs);
-        vector<vector<int> > exclusions(coulombs);
-        unsigned int total_exclusions = 0;
-        for (unsigned int i = 0; i < coulombs; i++)
-        {
-            int junk, numExclusions;
-            char atype[512];
-            infile >> 
-                junk >> 
-                c6[i] >>
-                c12[i] >>
-                q[i] >>
-                atype >>
-                scale[i] >>
-                numExclusions;
-                radius[i] = gpuGetAtomicRadius(gpu, atype);
-                symbol[i] = gpuGetAtomicSymbol(gpu, atype);
-                for (int j = 0; j < numExclusions; j++)
-                {
-                    int exclusion;
-                    infile >> exclusion;
-                    exclusions[i].push_back(exclusion);
-                }
-        }
-        cout << total_exclusions << " total exclusions.\n";
-        gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions, NO_CUTOFF);
-        gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
-        return coulombs;
-    }
-    else
-    {
-        cout << "Error opening Coulomb parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
        const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
@@ -911,44 +457,6 @@ void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDie
    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
 }
-extern "C"
-int gpuReadShakeParameters(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    if (!infile.fail())
-    {
-        char buff[512];
-        int shake_constraints;
-        infile >> buff >> shake_constraints;
-        infile.getline(buff, 512);
-        vector<int> atom1(shake_constraints);
-        vector<int> atom2(shake_constraints);
-        vector<float> distance(shake_constraints);
-        vector<float> invMass1(shake_constraints);
-        vector<float> invMass2(shake_constraints);
-        for (int i = 0; i < shake_constraints; i++)
-        {
-            int junk;
-            infile >> 
-                junk >> 
-                atom1[i] >> 
-                atom2[i] >> 
-                distance[i] >> 
-                invMass1[i] >> 
-                invMass2[i];
-        }
-        gpuSetShakeParameters(gpu, atom1, atom2, distance, invMass1, invMass2, 1e-4f);
-        return gpu->sim.ShakeConstraints;
-    }
-    else
-    {
-        cout << "Error opening Shake parameter file " << fname << endl;
-        exit(-1);
-    }
-    return 0;
-}
 extern "C"
 void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
        const vector<float>& invMass1, const vector<float>& invMass2, float tolerance)
@@ -1230,45 +738,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    return 1;
 }
-extern "C"
-void gpuReadCoordinates(gpuContext gpu, char* fname)
-{
-    ifstream infile(fname);
-    gpu->natoms = 0;
-    char buff[512];
-    infile >> buff >> gpu->natoms;
-    infile.getline(buff, 511);
-    float totalMass = 0.0f;
-    gpuAllocateInitialBuffers(gpu);
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        int junk;
-        infile >> junk >> 
-            gpu->psPosq4->_pSysStream[0][i].x >> 
-            gpu->psPosq4->_pSysStream[0][i].y >> 
-            gpu->psPosq4->_pSysStream[0][i].z >>
-            gpu->psPosq4->_pSysStream[0][i].w >>
-            gpu->psVelm4->_pSysStream[0][i].x >> 
-            gpu->psVelm4->_pSysStream[0][i].y >> 
-            gpu->psVelm4->_pSysStream[0][i].z >>
-            gpu->psVelm4->_pSysStream[0][i].w;
-        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
-        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
-        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
-        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;
-        // Accumulate mass
-        totalMass += 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
-    }
-    gpu->sim.inverseTotalMass = 1.0f / totalMass;
-    gpu->psPosq4->Upload();
-    gpu->psVelm4->Upload();
-    gpu->psxVector4->Upload();
-}
 extern "C"
 void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
 {
@@ -1341,37 +810,6 @@ bool gpuIsAvailable()
    return (deviceCount > 0);
 }
-extern "C"
-void* gpuInitFromFile(char* fname) 
-{
-    ifstream infile(fname);
-    int numAtoms = 0;
-    char buff[512];
-    infile >> buff >> numAtoms;
-    gpuContext gpu = (gpuContext) gpuInit(numAtoms);
-    vector<float> x(numAtoms), y(numAtoms), z(numAtoms), charge(numAtoms), vx(numAtoms), vy(numAtoms), vz(numAtoms), mass(numAtoms);
-    infile.getline(buff, 511);
-    float totalMass = 0.0f;
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        int junk;
-        infile >> junk >> 
-            x[i] >> 
-            y[i] >> 
-            z[i] >> 
-            charge[i] >> 
-            vx[i] >> 
-            vy[i] >> 
-            vz[i] >> 
-            mass[i];
-        mass[i] = 1.0f/mass[i];
-    }
-    gpuSetPositions(gpu, x, y, z);
-    gpuSetVelocities(gpu, vx, vy, vz);
-    gpuSetMass(gpu, mass);
-    return (void*)gpu;
-}
 extern "C"
 void* gpuInit(int numAtoms)
 {
@@ -1979,7 +1417,6 @@ int gpuSetConstants(gpuContext gpu)
    SetCalculateCDLJObcGbsaForces1Sim(gpu);
    SetCalculateLocalForcesSim(gpu);
    SetCalculateObcGbsaBornSumSim(gpu);
-    SetCalculateObcGbsaForces1Sim(gpu);
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);
@@ -1988,866 +1425,9 @@ int gpuSetConstants(gpuContext gpu)
    SetBrownianUpdateSim(gpu);
    SetSettleSim(gpu);
    SetRandomSim(gpu);
-    if (gpu->sm_version >= SM_12)
-    {
-        SetCalculateObcGbsaForces1_12Sim(gpu);
-    }
    return 1;
 }
-extern "C"
-void gpuDumpCoordinates(gpuContext gpu)
-{
-    gpu->psPosq4->Download();
-    gpu->psVelm4->Download();
-    (void) printf( "\n\nCoordinates and velocities\n" );
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, 
-            gpu->psPosq4->_pSysStream[0][i].x,
-            gpu->psPosq4->_pSysStream[0][i].y,
-            gpu->psPosq4->_pSysStream[0][i].z,
-            gpu->psPosq4->_pSysStream[0][i].w,
-            gpu->psVelm4->_pSysStream[0][i].x,
-            gpu->psVelm4->_pSysStream[0][i].y,
-            gpu->psVelm4->_pSysStream[0][i].z,
-            gpu->psVelm4->_pSysStream[0][i].w
-        );
-    }
-}
-bool ISNAN(float f)
-{
-    return !(f == f);
-}
-extern "C"
-bool gpuCheckData(gpuContext gpu)
-{
-    gpu->psPosq4->Download();
-    gpu->psVelm4->Download();
-    gpu->psForce4->Download();
-    gpu->psBornForce->Download();
-    int violations = 0;
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        if (ISNAN( gpu->psPosq4->_pSysStream[0][i].x) ||
-            ISNAN( gpu->psPosq4->_pSysStream[0][i].y) ||
-            ISNAN( gpu->psPosq4->_pSysStream[0][i].z) ||
-            ISNAN( gpu->psVelm4->_pSysStream[0][i].x) ||
-            ISNAN( gpu->psVelm4->_pSysStream[0][i].y) ||
-            ISNAN( gpu->psVelm4->_pSysStream[0][i].z) ||
-            ISNAN( gpu->psForce4->_pSysStream[0][i].x) ||
-            ISNAN( gpu->psForce4->_pSysStream[0][i].y) ||
-            ISNAN( gpu->psForce4->_pSysStream[0][i].z) ||
-            ISNAN( gpu->psBornForce->_pSysStream[0][i]))
-        {
-            printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, 
-                gpu->psPosq4->_pSysStream[0][i].x,
-                gpu->psPosq4->_pSysStream[0][i].y,
-                gpu->psPosq4->_pSysStream[0][i].z,
-                gpu->psVelm4->_pSysStream[0][i].x,
-                gpu->psVelm4->_pSysStream[0][i].y,
-                gpu->psVelm4->_pSysStream[0][i].z,
-                gpu->psForce4->_pSysStream[0][i].x,
-                gpu->psForce4->_pSysStream[0][i].y,
-                gpu->psForce4->_pSysStream[0][i].z,
-                gpu->psBornForce->_pSysStream[0][i]
-            );
-            violations++;
-        }
-    }
-    if (violations > 0)
-    {
-        printf("%d total violations\n", violations);
-        for (int i = 0; i < gpu->natoms; i++)
-        {
-            float dmin = 99999999.0f;
-            int closest = -9999;
-            float x = gpu->psPosq4->_pSysStream[0][i].x;
-            float y = gpu->psPosq4->_pSysStream[0][i].y;
-            float z = gpu->psPosq4->_pSysStream[0][i].z;
-            for (int j = 0; j < gpu->natoms; j++)
-            {
-                if (j != i)
-                {
-                    float dx = gpu->psPosq4->_pSysStream[0][j].x - x;
-                    float dy = gpu->psPosq4->_pSysStream[0][j].y - y;
-                    float dz = gpu->psPosq4->_pSysStream[0][j].z - z;
-                    float r = sqrt(dx * dx + dy * dy + dz * dz);
-                    if (r < dmin)
-                    {
-                        dmin = r;
-                        closest = j;
-                    }
-                }
-            }
-            printf("Atom %4d: Closest neighbor is Atom %4d, %11.5e\n", i, closest, dmin);
-        }
-        gpuDumpAtomData(gpu);
-        kClearBornForces(gpu);
-        kClearForces(gpu);
-        kCPUCalculateLocalForces(gpu);
-        // Determine which forces have gone awry
-        kClearBornForces(gpu);
-        kClearForces(gpu);
-        kCalculateCDLJForces(gpu);
-        kReduceForces(gpu);
-        printf("Nonbond Forces\n");
-        gpuDumpForces(gpu);
-        kClearBornForces(gpu);
-        kClearForces(gpu);
-        kCalculateObcGbsaForces1(gpu);
-        kReduceObcGbsaBornForces(gpu);
-        kCalculateObcGbsaForces2(gpu); 
-        kReduceForces(gpu);
-        printf("OBC Forces\n");
-        gpuDumpForces(gpu);
-        kClearBornForces(gpu);
-        kClearForces(gpu);
-        kCalculateLocalForces(gpu);
-        kReduceForces(gpu);
-        printf("Local Forces\n");
-        gpuDumpForces(gpu);
-        kClearBornForces(gpu);
-        kClearForces(gpu);
-        kReduceForces(gpu);
-        printf("Cleared Forces\n");
-        gpuDumpForces(gpu);
-        return false;
-    }
-    return true;
-}
-extern "C"
-void kCPUCalculate14(gpuContext gpu)
-{
-    gpu->psPosq4->Download();
-    gpu->psForce4->Download();
- //   gpu->psLJ14ID->Download();
- //   gpu->psLJ14Parameter->Download();
-    for (int pos = 0; pos < (int) gpu->sim.LJ14s; pos++)
-    {
-        int4 atom               = gpu->psLJ14ID->_pSysStream[0][pos];
-        float4 LJ14             = gpu->psLJ14Parameter->_pSysStream[0][pos];
-        float4 a1               = gpu->psPosq4->_pSysStream[0][atom.x];
-        float4 a2               = gpu->psPosq4->_pSysStream[0][atom.y];
-        float3 d;
-        d.x                     = a1.x - a2.x;
-        d.y                     = a1.y - a2.y;
-        d.z                     = a1.z - a2.z;
-        float r2                = d.x * d.x + d.y * d.y + d.z * d.z;
-        float inverseR          = 1.0f / sqrt(r2);
-        float sig2              = inverseR * LJ14.y;
-        sig2                   *= sig2;
-        float sig6              = sig2 * sig2 * sig2;
-        float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-        dEdR                   += LJ14.z * inverseR;
-        dEdR                   *= inverseR * inverseR;
-        unsigned int offsetA    = atom.x + atom.z * gpu->sim.stride;
-        unsigned int offsetB    = atom.y + atom.w * gpu->sim.stride;
-        float4 forceA           = gpu->psForce4->_pSysStream[0][offsetA];
-        float4 forceB           = gpu->psForce4->_pSysStream[0][offsetB];
-        d.x                    *= dEdR;
-        d.y                    *= dEdR;
-        d.z                    *= dEdR;
-        forceA.x               += d.x;
-        forceA.y               += d.y;
-        forceA.z               += d.z;
-        forceB.x               -= d.x;
-        forceB.y               -= d.y;
-        forceB.z               -= d.z;        
-        gpu->psForce4->_pSysStream[0][offsetA]   = forceA;
-        gpu->psForce4->_pSysStream[0][offsetB]   = forceB;
-        printf("%4d: %4d - %4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", pos, atom.x, atom.y, r2, dEdR, sig2, sig6, LJ14.x, LJ14.z); 
-    }        
-}
-extern "C"
-void gpuDumpPrimeCoordinates(gpuContext gpu)
-{
-    gpu->psPosqP4->Download();
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n", i, 
-            gpu->psPosqP4->_pSysStream[0][i].x,
-            gpu->psPosqP4->_pSysStream[0][i].y,
-            gpu->psPosqP4->_pSysStream[0][i].z,
-            gpu->psPosqP4->_pSysStream[0][i].w
-        );
-    }
-}
-extern "C"
-void gpuDumpForces(gpuContext gpu)
-{
-    gpu->psForce4->Download();
-    gpu->psBornForce->Download();
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        char buff[512];
-        sprintf(buff, "%4d: %11.5f %11.5f %11.5f %11.5f\n", i, 
-            gpu->psForce4->_pSysStream[0][i].x,
-            gpu->psForce4->_pSysStream[0][i].y,
-            gpu->psForce4->_pSysStream[0][i].z,
-            gpu->psBornForce->_pSysStream[0][i]
-        );
-//        OutputDebugString(buff);
-    }
-}
-extern "C"
-void gpuDumpAtomData(gpuContext gpu)
-{
-    gpu->psPosq4->Download();
-    gpu->psSigEps2->Download();
-    gpu->psBornRadii->Download();
-    gpu->psObcChain->Download();
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        char buff[512];
-        sprintf(buff, "%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, 
-            gpu->psPosq4->_pSysStream[0][i].x,
-            gpu->psPosq4->_pSysStream[0][i].y,
-            gpu->psPosq4->_pSysStream[0][i].z,
-            gpu->psPosq4->_pSysStream[0][i].w,
-            gpu->psSigEps2->_pSysStream[0][i].x,
-            gpu->psSigEps2->_pSysStream[0][i].y,
-            gpu->psBornRadii->_pSysStream[0][i],
-            gpu->psObcChain->_pSysStream[0][i]
-        );
-//        OutputDebugString((LPCWSTR)buff);
-    }
-}
-extern "C"
-void gpuSetup(void* pVoid)
-{            
-    gpuContext gpu = (gpuContext)pVoid;
-    // Read parameters
-    cout << gpuReadAtomicParameters(gpu, "Data/atomicradii.txt") << " atom types\n";
-    cout << gpuReadBondParameters(gpu, "Data/GromacsHarmonicBondParameter.txt") << " bond parameters.\n";
-    cout << gpuReadBondAngleParameters(gpu, "Data/GromacsAngleBondParameter.txt") << " bond angle parameters.\n";
-    cout << gpuReadDihedralParameters(gpu, "Data/GromacsProperDihedralParameter.txt") << " proper dihedral parameters.\n";
-    cout << gpuReadRbDihedralParameters(gpu, "Data/GromacsRbDihedralParameter.txt") << " Ryckaert-Bellemans dihedral parameters.\n";
-    cout << gpuReadLJ14Parameters(gpu, "Data/GromacsLJ14Parameter.txt") << " Lennard-Jones 1-4 parameters.\n";
-    cout << gpuReadCoulombParameters(gpu, "Data/GromacsLJCoulombParameter.txt") << " Coulomb parameters.\n";
-    cout << gpuReadShakeParameters(gpu, "Data/GromacsShakeParameters.txt") << " shake parameters.\n";
-    // Build thread block work list
-    gpuBuildThreadBlockWorkList(gpu);
-    // Build exclusion list
-    gpuBuildExclusionList(gpu);
-    // Create output buffers
-    gpuBuildOutputBuffers(gpu);
-    // Set constant blocks
-    gpuSetConstants(gpu);
-    // Initialize randoms
-    gpuInitializeRandoms(gpu);
-    // Initialize Born Radii;
-    kCalculateObcGbsaBornSum(gpu);
-    kReduceObcGbsaBornSum(gpu);
-    kClearForces(gpu);
-    kClearBornForces(gpu);
-    return;
-}
-#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
-#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
-{ \
-    dp          = DOT3(v1, v2); \
-    float norm1 = DOT3(v1, v1); \
-    float norm2 = DOT3(v2, v2); \
-    dp /= sqrt(norm1 * norm2); \
-    dp = min(dp, 1.0f); \
-    dp = max(dp, -1.0f); \
-}
-#define CROSS_PRODUCT(v1, v2, c) \
-    c.x = v1.y * v2.z - v1.z * v2.y; \
-    c.y = v1.z * v2.x - v1.x * v2.z; \
-    c.z = v1.x * v2.y - v1.y * v2.x;
-#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
-{ \
-   float angle          = acos(cosine); \
-   float deltaIdeal     = angle - (param.x * (3.14159265f / 180.0f)); \
-   dEdR                 = param.y * deltaIdeal; \
-}
-#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
-{ \
-    float dp; \
-    GETNORMEDDOTPRODUCT(v1, v2, dp); \
-    angle = acos(dp); \
-}
-#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
-{ \
-    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
-    angle = acos(cosine); \
-}
-#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}                                                          
-#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}    
-// Calculate Local forces on CPU
-extern "C"
-void kCPUCalculateLocalForces(gpuContext gpu)
-{
-    gpu->psPosq4->Download();
-    gpu->psForce4->Download();
-    gpu->psBondID->Download();
-    gpu->psBondParameter->Download();
-    gpu->psBondAngleID1->Download();
-    gpu->psBondAngleID2->Download();
-    gpu->psBondAngleParameter->Download();
-    gpu->psDihedralID1->Download();
-    gpu->psDihedralID2->Download();
-    gpu->psDihedralParameter->Download();
-    gpu->psRbDihedralID1->Download();
-    gpu->psRbDihedralID2->Download();
-    gpu->psRbDihedralParameter1->Download();
-    gpu->psRbDihedralParameter2->Download();
-    gpu->psLJ14ID->Download();
-    gpu->psLJ14Parameter->Download();
-    unsigned int pos = 0;
-    Vectors V;
-    Vectors* A = &V;
-    int violations = 0;
-    while (pos < gpu->sim.bond_offset)
-    {
-        if (pos < gpu->sim.bonds)
-        {
-            int4   atom         = gpu->psBondID->_pSysStream[0][pos];
-            float4 atomA        = gpu->psPosq4->_pSysStream[0][atom.x];
-            float4 atomB        = gpu->psPosq4->_pSysStream[0][atom.y];
-            float2 bond         = gpu->psBondParameter->_pSysStream[0][pos];
-            float dx            = atomB.x - atomA.x;
-            float dy            = atomB.y - atomA.y;
-            float dz            = atomB.z - atomA.z;
-            float r2            = dx * dx + dy * dy + dz * dz;
-            float r             = sqrt(r2);
-            float deltaIdeal    = r - bond.x;
-            float dEdR          = bond.y * deltaIdeal;
-            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
-            if (fabs(deltaIdeal) > 1.0f)
-            {
-                printf("Bond %4d: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", pos, dx, dy, dz, r, deltaIdeal, dEdR);
-                violations++;
-            }
-            dx                 *= dEdR;
-            dy                 *= dEdR;
-            dz                 *= dEdR;
-            unsigned int offsetA                = atom.x + atom.z * gpu->sim.stride;
-            unsigned int offsetB                = atom.y + atom.w * gpu->sim.stride;
-            float4 forceA                       = gpu->psForce4->_pSysStream[0][offsetA];
-            float4 forceB                       = gpu->psForce4->_pSysStream[0][offsetB];
-            forceA.x                           += dx;
-            forceA.y                           += dy;
-            forceA.z                           += dz;
-            forceB.x                           -= dx;
-            forceB.y                           -= dy;
-            forceB.z                           -= dz;
-            gpu->psForce4->_pSysStream[0][offsetA]               = forceA;
-            gpu->psForce4->_pSysStream[0][offsetB]               = forceB;    
-        }
-        pos++;
-    }
-#if 0  
-    while (pos < gpu->sim.bond_angle_offset)
-    {
-        unsigned int pos1   = pos - gpu->sim.bond_offset;
-        if (pos1 < gpu->sim.bond_angles)
-        {
-            int4   atom1        = gpu->psBondAngleID1->_pSysStream[0][pos1];  
-            float2 bond_angle   = gpu->psBondAngleParameter->_pSysStream[0][pos1];
-            float4 a1           = gpu->psPosq4->_pSysStream[0][atom1.x];
-            float4 a2           = gpu->psPosq4->_pSysStream[0][atom1.y];
-            float4 a3           = gpu->psPosq4->_pSysStream[0][atom1.z];
-            A->v0.x = a2.x - a1.x;
-            A->v0.y = a2.y - a1.y;
-            A->v0.z = a2.z - a1.z;
-            A->v1.x = a2.x - a3.x;
-            A->v1.y = a2.y - a3.y;
-            A->v1.z = a2.z - a3.z;
-            float3 cp;
-            CROSS_PRODUCT(A->v0, A->v1, cp);
-            float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
-            rp = max(sqrt(rp), 1.0e-06f);
-            float r21       = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
-            float r23       = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
-            float dot       = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
-            float cosine    = dot / sqrt(r21 * r23);
-            float dEdR;
-            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
-            printf("Bond angle %4d %11.4f %11.4f\n", pos1, cosine, dEdR);
-            float termA =  dEdR / (r21 * rp);
-            float termC = -dEdR / (r23 * rp);
-            float3 c21;
-            float3 c23;
-            CROSS_PRODUCT(A->v0, cp, c21);
-            CROSS_PRODUCT(A->v1, cp, c23);
-            c21.x *= termA;
-            c21.y *= termA;
-            c21.z *= termA;
-            c23.x *= termC;
-            c23.y *= termC;
-            c23.z *= termC;
-            int2 atom2 = gpu->psBondAngleID2->_pSysStream[0][pos1];
-            unsigned int offset = atom1.x + atom1.w * gpu->sim.stride;
-            float4 force = gpu->psForce4->_pSysStream[0][offset]; 
-            force.x += c21.x;
-            force.y += c21.y;
-            force.z += c21.z;
-            gpu->psForce4->_pSysStream[0][offset] = force;
-            offset = atom1.y + atom2.x * gpu->sim.stride;
-            force = gpu->psForce4->_pSysStream[0][offset];
-            force.x -= (c21.x + c23.x);
-            force.y -= (c21.y + c23.y);
-            force.z -= (c21.z + c23.z);
-            gpu->psForce4->_pSysStream[0][offset] = force;
-            offset = atom1.z + atom2.y * gpu->sim.stride;
-            force = gpu->psForce4->_pSysStream[0][offset];
-            force.x += c23.x;
-            force.y += c23.y;
-            force.z += c23.z;
-            gpu->psForce4->_pSysStream[0][offset] = force;
-        }
-        pos++;
-    }
-    while (pos < gpu->sim.dihedral_offset)
-    {
-        unsigned int pos1 = pos - gpu->sim.bond_angle_offset;
-        if (pos1 < gpu->sim.dihedrals)
-        {
-            int4   atom1        = gpu->psDihedralID1->_pSysStream[0][pos1];  
-            float4 atomA        = gpu->psPosq4->_pSysStream[0][atom1.x];
-            float4 atomB        = gpu->psPosq4->_pSysStream[0][atom1.y];
-            float4 atomC        = gpu->psPosq4->_pSysStream[0][atom1.z];
-            float4 atomD        = gpu->psPosq4->_pSysStream[0][atom1.w];            
-            A->v0.x             = atomA.x - atomB.x;
-            A->v0.y             = atomA.y - atomB.y;
-            A->v0.z             = atomA.z - atomB.z;
-            A->v1.x             = atomC.x - atomB.x;
-            A->v1.y             = atomC.y - atomB.y;
-            A->v1.z             = atomC.z - atomB.z;
-            A->v2.x             = atomC.x - atomD.x;
-            A->v2.y             = atomC.y - atomD.y;
-            A->v2.z             = atomC.z - atomD.z; 
-            float3 cp0, cp1;
-            float dihedralAngle;
-            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
-            float4 dihedral         = gpu->psDihedralParameter->_pSysStream[0][pos1];
-            float deltaAngle        = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
-            float sinDeltaAngle     = sin(deltaAngle);
-            float dEdAngle          = -dihedral.x * dihedral.z * sinDeltaAngle;
-            float normCross1        = DOT3(cp0, cp0);
-            float normBC            = sqrt(DOT3(A->v1, A->v1));
-            float4 ff;
-            ff.x                    = (-dEdAngle * normBC) / normCross1;
-            float normCross2        = DOT3(cp1, cp1);
-            ff.w                    = (dEdAngle * normBC) / normCross2;
-            float dp                = 1.0f / DOT3(A->v1, A->v1);
-            ff.y                    = DOT3(A->v0, A->v1) * dp;
-            ff.z                    = DOT3(A->v2, A->v1) * dp;
-            int4  atom2             = gpu->psDihedralID2->_pSysStream[0][pos1];   
-            float3 internalF0;
-            float3 internalF3;
-            float3 s;
-//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
-            unsigned int offset                 = atom1.x + atom2.x * gpu->sim.stride;
-            float4 force                        = gpu->psForce4->_pSysStream[0][offset];
-            internalF0.x                        = ff.x * cp0.x; 
-            force.x                            += internalF0.x;
-            internalF0.y                        = ff.x * cp0.y;
-            force.y                            += internalF0.y;
-            internalF0.z                        = ff.x * cp0.z;       
-            force.z                            += internalF0.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-            offset                              = atom1.w + atom2.w * gpu->sim.stride;
-            force                               = gpu->psForce4->_pSysStream[0][offset];
-            internalF3.x                        = ff.w * cp1.x;
-            force.x                            += internalF3.x;
-            internalF3.y                        = ff.w * cp1.y;
-            force.y                            += internalF3.y;
-            internalF3.z                        = ff.w * cp1.z;
-            force.z                            += internalF3.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
-            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
-            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
-            offset                              = atom1.y + atom2.y * gpu->sim.stride;
-            force                               = gpu->psForce4->_pSysStream[0][offset];
-            force.x                            += -internalF0.x + s.x;
-            force.y                            += -internalF0.y + s.y;
-            force.z                            += -internalF0.z + s.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-            offset                              = atom1.z + atom2.z * gpu->sim.stride;
-            force                               = gpu->psForce4->_pSysStream[0][offset];
-            force.x                            += -internalF3.x - s.x;
-            force.y                            += -internalF3.y - s.y;
-            force.z                            += -internalF3.z - s.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("Dihedral %4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-        }        
-        pos++;
-    }
-    while (pos < gpu->sim.rb_dihedral_offset)
-    {
-        unsigned int pos1 = pos - gpu->sim.dihedral_offset;
-        if (pos1 < gpu->sim.rb_dihedrals)
-        {
-            int4   atom1        = gpu->psRbDihedralID1->_pSysStream[0][pos1];  
-            float4 atomA        = gpu->psPosq4->_pSysStream[0][atom1.x];
-            float4 atomB        = gpu->psPosq4->_pSysStream[0][atom1.y];
-            float4 atomC        = gpu->psPosq4->_pSysStream[0][atom1.z];
-            float4 atomD        = gpu->psPosq4->_pSysStream[0][atom1.w];            
-            A->v0.x             = atomA.x - atomB.x;
-            A->v0.y             = atomA.y - atomB.y;
-            A->v0.z             = atomA.z - atomB.z;
-            A->v1.x             = atomC.x - atomB.x;
-            A->v1.y             = atomC.y - atomB.y;
-            A->v1.z             = atomC.z - atomB.z;
-            A->v2.x             = atomC.x - atomD.x;
-            A->v2.y             = atomC.y - atomD.y;
-            A->v2.z             = atomC.z - atomD.z; 
-            float3 cp0, cp1;
-            float dihedralAngle, cosPhi;
-      //      printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z); 
-      //      printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z); 
-      //      printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);  
-            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
-            if (dihedralAngle < 0.0f )
-            {
-                dihedralAngle += 3.14159265f;
-            } 
-            else 
-            {
-                dihedralAngle -= 3.14159265f;
-            }
-            cosPhi                  = -cosPhi;
-         //   printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
-            float4 dihedral1        = gpu->psRbDihedralParameter1->_pSysStream[0][pos1];
-            float2 dihedral2        = gpu->psRbDihedralParameter2->_pSysStream[0][pos1];
-            float cosFactor         = cosPhi;
-            float dEdAngle          = -dihedral1.y;
-        //    printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
-            dEdAngle               -= 2.0f * dihedral1.z * cosFactor;
-       //     printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 3.0f * dihedral1.w * cosFactor;
-     //       printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 4.0f * dihedral2.x * cosFactor;
-   //         printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 5.0f * dihedral2.y * cosFactor;
- //           printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            dEdAngle               *= sin(dihedralAngle);  
-//            printf("%4d - f: %9.4f\n", pos1, dEdAngle);
-            float normCross1        = DOT3(cp0, cp0);
-            float normBC            = sqrt(DOT3(A->v1, A->v1));
-            float4 ff;
-            ff.x                    = (-dEdAngle * normBC) / normCross1;
-            float normCross2        = DOT3(cp1, cp1);
-            ff.w                    = (dEdAngle * normBC) / normCross2;
-            float dp                = 1.0f / DOT3(A->v1, A->v1);
-            ff.y                    = DOT3(A->v0, A->v1) * dp;
-            ff.z                    = DOT3(A->v2, A->v1) * dp;
-            int4  atom2             = gpu->psRbDihedralID2->_pSysStream[0][pos1];   
-            float3 internalF0;
-            float3 internalF3;
-            float3 s;
-            printf("RB Dihedral %4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
-            unsigned int offset                 = atom1.x + atom2.x * gpu->sim.stride;
-            float4 force                        = gpu->psForce4->_pSysStream[0][offset];
-            internalF0.x                        = ff.x * cp0.x; 
-            force.x                            += internalF0.x;
-            internalF0.y                        = ff.x * cp0.y;
-            force.y                            += internalF0.y;
-            internalF0.z                        = ff.x * cp0.z;       
-            force.z                            += internalF0.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("RB Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-            offset                              = atom1.w + atom2.w * gpu->sim.stride;
-            force                               = gpu->psForce4->_pSysStream[0][offset];
-            internalF3.x                        = ff.w * cp1.x;
-            force.x                            += internalF3.x;
-            internalF3.y                        = ff.w * cp1.y;
-            force.y                            += internalF3.y;
-            internalF3.z                        = ff.w * cp1.z;
-            force.z                            += internalF3.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("RB Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
-            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
-            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
-            offset                              = atom1.y + atom2.y * gpu->sim.stride;
-            force                               = gpu->psForce4->_pSysStream[0][offset];
-            force.x                            += -internalF0.x + s.x;
-            force.y                            += -internalF0.y + s.y;
-            force.z                            += -internalF0.z + s.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-            printf("RB Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-            offset                              = atom1.z + atom2.z * gpu->sim.stride;
-            force                               = gpu->psForce4->_pSysStream[0][offset];
-            force.x                            += -internalF3.x - s.x;
-            force.y                            += -internalF3.y - s.y;
-            force.z                            += -internalF3.z - s.z;
-            gpu->psForce4->_pSysStream[0][offset]                = force;
-     //       printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
-        }            
-        pos++;
-    }   
-    while (pos < gpu->sim.LJ14_offset)
-    {  
-        unsigned int pos1       = pos - gpu->sim.rb_dihedral_offset;
-        if (pos1 < gpu->sim.LJ14s)
-        {
-            int4 atom               = gpu->psLJ14ID->_pSysStream[0][pos1];
-            float4 LJ14             = gpu->psLJ14Parameter->_pSysStream[0][pos1];
-            float4 a1               = gpu->psPosq4->_pSysStream[0][atom.x];
-            float4 a2               = gpu->psPosq4->_pSysStream[0][atom.y];
-            float3 d;
-            d.x                     = a1.x - a2.x;
-            d.y                     = a1.y - a2.y;
-            d.z                     = a1.z - a2.z;
-            float r2                = DOT3(d, d);
-            float inverseR          = 1.0f / sqrt(r2);
-            float sig2              = inverseR * LJ14.y;
-            sig2                   *= sig2;
-            float sig6              = sig2 * sig2 * sig2;
-            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-            dEdR                   += LJ14.z * inverseR;
-            dEdR                   *= inverseR * inverseR;
-            unsigned int offsetA    = atom.x + atom.z * gpu->sim.stride;
-            unsigned int offsetB    = atom.y + atom.w * gpu->sim.stride;
-            float4 forceA           = gpu->psForce4->_pSysStream[0][offsetA];
-            float4 forceB           = gpu->psForce4->_pSysStream[0][offsetB];
-            d.x                    *= dEdR;
-            d.y                    *= dEdR;
-            d.z                    *= dEdR;
-            forceA.x               += d.x;
-            forceA.y               += d.y;
-            forceA.z               += d.z;
-            forceB.x               -= d.x;
-            forceB.y               -= d.y;
-            forceB.z               -= d.z;        
-            printf("LJ14 %d: %11.4f %11.4f %11.4f\n", pos1, d.x, d.y, d.z);
-            gpu->psForce4->_pSysStream[0][offsetA]   = forceA;
-            gpu->psForce4->_pSysStream[0][offsetB]   = forceB;
-        }        
-        pos++;
-    }
-#endif
-    if (violations > 0)
-    {
-        gpuDumpCoordinates(gpu);
-        gpuDumpForces(gpu);
-    }
-}
-static FILE* getWriteToFilePtr( char* fname, int step )
-{
-   std::stringstream fileName;
-   fileName << fname << "_";
-   fileName << step;
-   fileName << ".txt";
-   FILE* filePtr = fopen( fileName.str().c_str(), "w" );
-   if( filePtr == NULL ){
-      (void) fprintf( stderr, "Could not open file=<%s> for writitng.", fileName.str().c_str() );
-      exit(-1);
-   }
-   return filePtr;
-}
-extern "C" {
-static void printValues( FILE* filePtr, int index, int numberOfValues, float* values )
-{
-   int i;
-   (void) fprintf( filePtr, "%5d ", index );
-   for ( i = 0; i < numberOfValues; i++ ) { 
-      (void) fprintf( filePtr, " %18.10e", values[i] );
-   }
-   (void) fprintf( filePtr, "\n" );
-   (void) fflush( filePtr );
-} 
-}
-extern "C"
-void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint )
-{
-   int i;
-   static const int numberOfValues = 1;
-   FILE* filePtr = getWriteToFilePtr( fname, step );
-   float values[numberOfValues];
-   psPos->Download();
-   numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;
-   for ( i = 0; i < numPrint; i++ ) { 
-      values[0] = psPos->_pSysStream[0][i];
-      printValues( filePtr, i, numberOfValues, values ); 
-   }
-   for ( i = gpu->natoms - numPrint; i < gpu->natoms; i++ ) { 
-      values[0] = psPos->_pSysStream[0][i];
-      printValues( filePtr, i, numberOfValues, values ); 
-   }
-   (void) fclose( filePtr );
-}
-extern "C"
-void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint )
-{
-   int i;
-   static const int numberOfValues = 2;
-   FILE* filePtr = getWriteToFilePtr( fname, step );
-   float values[numberOfValues];
-   psPos->Download();
-   numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;
-   for ( i = 0; i < numPrint; i++ ) { 
-      values[0] = psPos->_pSysStream[0][i].x;
-      values[1] = psPos->_pSysStream[0][i].y;
-      printValues( filePtr, i, numberOfValues, values ); 
-   }
-   for ( i = gpu->natoms - numPrint; i < gpu->natoms; i++ ) { 
-      values[0] = psPos->_pSysStream[0][i].x;
-      values[1] = psPos->_pSysStream[0][i].y;
-      printValues( filePtr, i, numberOfValues, values ); 
-   }
-   (void) fclose( filePtr );
-}
-extern "C"
-void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint )
-{
-   int i;
-   static const int numberOfValues = 4;
-   FILE* filePtr = getWriteToFilePtr( fname, step );
-   float values[numberOfValues];
-   psPos->Download();
-   numPrint = (numPrint > 0 && (numPrint < gpu->natoms)) ? numPrint : gpu->natoms;
-   for ( i = 0; i < numPrint; i++ ) { 
-      values[0] = psPos->_pSysStream[0][i].x;
-      values[1] = psPos->_pSysStream[0][i].y;
-      values[2] = psPos->_pSysStream[0][i].z;
-      values[3] = psPos->_pSysStream[0][i].w;
-      printValues( filePtr, i, numberOfValues, values ); 
-   }
-   for ( i = gpu->natoms - numPrint; i < gpu->natoms; i++ ) { 
-      values[0] = psPos->_pSysStream[0][i].x;
-      values[1] = psPos->_pSysStream[0][i].y;
-      values[2] = psPos->_pSysStream[0][i].z;
-      values[3] = psPos->_pSysStream[0][i].w;
-      printValues( filePtr, i, numberOfValues, values ); 
-   }
-   (void) fclose( filePtr );
-}
-extern "C"
-void gpuDumpObcInfo(gpuContext gpu)
-{
-    gpu->psPosq4->Download();
-    gpu->psBornRadii->Download();
-    gpu->psObcData->Download();
-    gpu->psBornSum->Download();
-    printf( "\n\nObc Info xyzw Brad atomR scaledAtomR\n" );
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, 
-            gpu->psPosq4->_pSysStream[0][i].x,
-            gpu->psPosq4->_pSysStream[0][i].y,
-            gpu->psPosq4->_pSysStream[0][i].z,
-            gpu->psPosq4->_pSysStream[0][i].w,
-            gpu->psBornRadii->_pSysStream[0][i],
-            gpu->psBornSum->_pSysStream[0][i],
-            gpu->psObcData->_pSysStream[0][i].x,
-            gpu->psObcData->_pSysStream[0][i].y
-        );
-    }
-}
-extern "C"
-void gpuDumpObcLoop1(gpuContext gpu)
-{
-    float compF;
-    gpu->psForce4->Download();
-    gpu->psBornRadii->Download();
-    gpu->psBornForce->Download();
-    gpu->psObcChain->Download();
-    gpu->psBornSum->Download();
-    printf( "\n\nObc F3 BrnR BrnF Chn\n" );
-    for (int i = 0; i < gpu->natoms; i++)
-    {
-	     compF = gpu->psBornForce->_pSysStream[0][i]/(gpu->psBornRadii->_pSysStream[0][i]*gpu->psBornRadii->_pSysStream[0][i]*gpu->psObcChain->_pSysStream[0][i]);
-        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, 
-            gpu->psForce4->_pSysStream[0][i].x,
-            gpu->psForce4->_pSysStream[0][i].y,
-            gpu->psForce4->_pSysStream[0][i].z,
-//            gpu->psForce4->_pSysStream[0][i].w,
-            gpu->psBornRadii->_pSysStream[0][i],
-				compF,
-            gpu->psBornForce->_pSysStream[0][i],
-//            gpu->psBornSum->_pSysStream[0][i],
-            gpu->psObcChain->_pSysStream[0][i]
-        );
-    }
-}
 static void tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds)
 {
    // Recursively tag atoms as belonging to a particular molecule.

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -138,52 +138,25 @@ typedef struct _gpuContext *gpuContext;
 extern "C"
 bool gpuIsAvailable();
-extern "C"
-int gpuReadBondParameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
-extern "C"
-int gpuReadBondAngleParameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
        const std::vector<float>& angle, const std::vector<float>& k);
-extern "C"
-int gpuReadDihedralParameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
        const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
-extern "C"
-int gpuReadRbDihedralParameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
        const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
-extern "C"
-int gpuReadLJ14Parameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
-extern "C"
-float gpuGetAtomicRadius(gpuContext gpu, std::string s);
-extern "C"
-unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);
-extern "C"
-int gpuReadAtomicParameters(gpuContext gpu, char* fname);
-extern "C"
-int gpuReadCoulombParameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
@@ -197,9 +170,6 @@ void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize
 extern "C"
 void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
-extern "C"
-int gpuReadShakeParameters(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
        const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);
@@ -207,9 +177,6 @@ void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const
 extern "C"
 int gpuAllocateInitialBuffers(gpuContext gpu);
-extern "C"
-void gpuReadCoordinates(gpuContext gpu, char* fname);
 extern "C"
 void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
@@ -222,9 +189,6 @@ void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
 extern "C"
 void gpuInitializeRandoms(gpuContext gpu);
-extern "C"
-void* gpuInitFromFile(char* fname);
 extern "C"
 void* gpuInit(int numAtoms);
@@ -255,48 +219,6 @@ void gpuBuildExclusionList(gpuContext gpu);
 extern "C"
 int gpuSetConstants(gpuContext gpu);
-extern "C"
-void gpuDumpCoordinates(gpuContext gpu);
-extern "C"
-void gpuDumpPrimeCoordinates(gpuContext gpu);
-extern "C"
-void gpuDumpForces(gpuContext gpu);
-extern "C"
-void gpuDumpAtomData(gpuContext gpu);
-extern "C"
-bool gpuCheckData(gpuContext gpu);
-extern "C"
-void gpuSetup(void* pVoid);
-extern "C"
-void kCPUCalculate14(gpuContext gpu);
-extern "C"
-void kCPUCalculateLocalForces(gpuContext gpu);
-extern "C"
-void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>*  psPos, int numPrint );
-extern "C"
-void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint );
-extern "C"
-void WriteArrayToFile3( gpuContext gpu, char* fname, int step, CUDAStream<float3>* psPos, int numPrint );
-extern "C"
-void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint );
-extern "C"
-void gpuDumpObcInfo(gpuContext gpu);
-extern "C"
-void gpuDumpObcLoop1(gpuContext gpu); 
 extern "C"
 void gpuReorderAtoms(gpuContext gpu);

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
@@ -152,18 +152,6 @@ void kReduceObcGbsaBornSum(gpuContext gpu)
 //    printf("kReduceObcGbsaBornSum\n");
    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
    gpu->bRecalculateBornRadii = false;
-if( 0 ){
-   static int step = 0;
-   int numPrint    = -1;
-   step++;
-   WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
-   WriteArrayToFile1( gpu, "ObcGbsaBornSum", step, gpu->psBornSum, numPrint );
-   WriteArrayToFile2( gpu, "ObcGbsaObcData", step, gpu->psObcData, numPrint );
-   WriteArrayToFile4( gpu, "ObcGbsaBornPos", step, gpu->psPosq4, numPrint );
-   //gpuDumpCoordinates( gpu );
-   gpuDumpObcInfo( gpu );
-}
    LAUNCHERROR("kReduceObcGbsaBornSum");
 }

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces1.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float br;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-};
-__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-static __constant__ cudaGmxSimulation cSim;
-void SetCalculateObcGbsaForces1Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculateObcGbsaForces1Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-__global__ void kReduceObcGbsaBornForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    while (pos < cSim.atoms)
-    {
-        float bornRadius = cSim.pBornRadii[pos];
-        float obcChain   = cSim.pObcChain[pos];
-        float2 obcData   = cSim.pObcData[pos];
-        float totalForce = 0.0f;
-        float* pFt = cSim.pBornForce + pos;
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-// __syncthreads();       
-        //printf("%4d: %9.4f %9.4f %9.4f\n", pos, totalForce, bornRadius, obcChain);
-//totalForce = 0.0f;        
-//        if (bornRadius > 0.0f)
-//        {
-            float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
-            float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
-            //float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
-            float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
-            totalForce        += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
-//        }
-        totalForce *= bornRadius * bornRadius * obcChain;
-        pFt = cSim.pBornForce + pos;
-        *pFt = totalForce;
-        pos += gridDim.x * blockDim.x;
-    }   
-}
-__global__ void kReduceObcGbsaBornForces1_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    while (pos < cSim.atoms)
-    {
-        float bornRadius = cSim.pBornRadii[pos];
-        float obcChain   = cSim.pObcChain[pos];
-        //float2 obcData   = cSim.pObcData[pos];
-        float totalForce = 0.0f;
-        float* pFt = cSim.pBornForce + pos;
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-// __syncthreads();       
-        //printf("%4d: %9.4f %9.4f %9.4f\n", pos, totalForce, bornRadius, obcChain);
-//totalForce = 0.0f;        
-/*
-//        if (bornRadius > 0.0f)
-//        {
-            float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
-            float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
-            float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
-            totalForce        += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
-//        }
-		  */
-        totalForce *= bornRadius * bornRadius * obcChain;
-        cSim.pBornForce[pos] = totalForce;
-        pos += gridDim.x * blockDim.x;
-    }   
-}
-__global__ void kAceGbsa_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    while (pos < cSim.atoms)
-    {
-        float bornRadius = cSim.pBornRadii[pos];
-        float obcChain   = cSim.pObcChain[pos];
-        float2 obcData   = cSim.pObcData[pos];
-        float totalForce = cSim.pBornForce[pos];
-        //float totalForce = 0.0f;
-        float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
-        float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
-		  /*
-        float ratio6       = (obcData.x + cSim.dielectricOffset) / bornRadius;
-		        ratio6       = ratio6*ratio6;
-		        ratio6       = ratio6*ratio6*ratio6;
-	*/	
-        //float saTerm       = 41.84f*cSim.surfaceAreaFactor * r * r * ratio6;
-        float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
-        totalForce        += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
-        totalForce        *= bornRadius * bornRadius * obcChain;
-        cSim.pBornForce[pos] = totalForce;
-        pos += gridDim.x * blockDim.x;
-    }   
-}
-void kReduceObcGbsaBornForces(gpuContext gpu)
-{
-    //printf("kReduceObcGbsaBornForces QQ\n");
-    kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
-    //kReduceObcGbsaBornForces1_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
-    //kAceGbsa_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
-    //printf("kReduceObcGbsaBornForces calling gpuDumpObcLoop1 QQ\n");
-	 //gpuDumpObcLoop1(gpu);
-}
-__global__ void kCalculateObcGbsaForces1_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
-    while (pos >= 0)
-    {  
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float4      af;     // Local atom fx, fy, fz, fb
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF
-            unsigned int i          = x + tgx;
-            apos                    = cSim.pPosq[i];
-            float br                = cSim.pBornRadii[i];
-            sA[threadIdx.x].x       = apos.x;
-            sA[threadIdx.x].y       = apos.y;
-            sA[threadIdx.x].z       = apos.z;
-            sA[threadIdx.x].q       = apos.w;
-            sA[threadIdx.x].br      = br;
-            af.x                    = 0.0f;
-            af.y                    = 0.0f;
-            af.z                    = 0.0f;
-            af.w                    = 0.0f;
-            apos.w                 *= cSim.preFactor;
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                float dx                = psA[j].x - apos.x; 
-                float dy                = psA[j].y - apos.y; 
-                float dz                = psA[j].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float alpha2_ij         = br * psA[j].br; 
-                float D_ij              = r2 / (4.0f * alpha2_ij); 
-                float expTerm           = exp(-D_ij); 
-                float denominator2      = r2 + alpha2_ij * expTerm; 
-                float denominator       = sqrt(denominator2); 
-                float Gpol              = (apos.w * psA[j].q) / (denominator * denominator2); 
-                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
-                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                dx                     *= dGpol_dr; 
-                dy                     *= dGpol_dr; 
-                dz                     *= dGpol_dr; 
-                af.x                   -= dx; 
-                af.y                   -= dy; 
-                af.z                   -= dz; 
-                af.w                   += dGpol_dalpha2_ij * psA[j].br;      
-            }
-            // Write results
-            int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = af;
-            cSim.pBornForce[offset]             = af.w;
-        }         
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            int j                   = y + tgx;
-            unsigned int i          = x + tgx;
-            float4 temp             = cSim.pPosq[j];
-            float temp1             = cSim.pBornRadii[j];
-            apos                    = cSim.pPosq[i];
-            float br                = cSim.pBornRadii[i];
-            sA[threadIdx.x].x       = temp.x;
-            sA[threadIdx.x].y       = temp.y;
-            sA[threadIdx.x].z       = temp.z;
-            sA[threadIdx.x].q       = temp.w;
-            sA[threadIdx.x].br      = temp1;
-            sA[threadIdx.x].fx      = af.x = 0.0f;
-            sA[threadIdx.x].fy      = af.y = 0.0f;
-            sA[threadIdx.x].fz      = af.z = 0.0f;
-            sA[threadIdx.x].fb      = af.w = 0.0f;
-            apos.w                 *= cSim.preFactor;
-            for (j = 0; j < GRID; j++)
-            {   
-                float dx                = psA[tj].x - apos.x; 
-                float dy                = psA[tj].y - apos.y; 
-                float dz                = psA[tj].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float alpha2_ij         = br * psA[tj].br; 
-                float D_ij              = r2 / (4.0f * alpha2_ij); 
-                float expTerm           = exp(-D_ij); 
-                float denominator2      = r2 + alpha2_ij * expTerm; 
-                float denominator       = sqrt(denominator2); 
-                float Gpol              = (apos.w * psA[tj].q) / (denominator * denominator2); 
-                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
-                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                dx                     *= dGpol_dr; 
-                dy                     *= dGpol_dr; 
-                dz                     *= dGpol_dr; 
-                af.x                   -= dx; 
-                af.y                   -= dy; 
-                af.z                   -= dz; 
-                psA[tj].fx             += dx; 
-                psA[tj].fy             += dy; 
-                psA[tj].fz             += dz; 
-                af.w                   += dGpol_dalpha2_ij * psA[tj].br; 
-                psA[tj].fb             += dGpol_dalpha2_ij * br;        
-                tj                      = sNext[tj]; 
-            }
-            // Write results
-            int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = af;
-            cSim.pBornForce[offset]             = af.w;
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            af.x                                = sA[threadIdx.x].fx;
-            af.y                                = sA[threadIdx.x].fy;
-            af.z                                = sA[threadIdx.x].fz;
-            af.w                                = sA[threadIdx.x].fb;
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = af;
-            cSim.pBornForce[offset]             = af.w;
-        }
-        pos -= cSim.nonbond_workBlock;     
-    }
-}
-__global__ extern void kCalculateObcGbsaForces1_12_kernel();
-void kCalculateObcGbsaForces1(gpuContext gpu)
-{
-    //printf("kCalculateObcGbsaForces1 version=%d sm_12=%d QQ\n", gpu->sm_version, SM_12);
-    if (gpu->sm_version < SM_12)
-        kCalculateObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    else
-        kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    LAUNCHERROR("kCalculateObcGbsaForce1");
-}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces1_12.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float br;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-};
-__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-static __constant__ cudaGmxSimulation cSim;
-void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-__global__ void kCalculateObcGbsaForces1_12_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
-    while (pos >= 0)
-    {  
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float4      af;     // Local atom fx, fy, fz, fb
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF
-            unsigned int i          = x + tgx;
-            apos                    = cSim.pPosq[i];
-            float br                = cSim.pBornRadii[i];
-            sA[threadIdx.x].x       = apos.x;
-            sA[threadIdx.x].y       = apos.y;
-            sA[threadIdx.x].z       = apos.z;
-            sA[threadIdx.x].q       = apos.w;
-            sA[threadIdx.x].br      = br;
-            af.x                    = 0.0f;
-            af.y                    = 0.0f;
-            af.z                    = 0.0f;
-            af.w                    = 0.0f;
-            apos.w                 *= cSim.preFactor;
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                float dx                = psA[j].x - apos.x; 
-                float dy                = psA[j].y - apos.y; 
-                float dz                = psA[j].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float alpha2_ij         = br * psA[j].br; 
-                float D_ij              = r2 / (4.0f * alpha2_ij); 
-                float expTerm           = exp(-D_ij); 
-                float denominator2      = r2 + alpha2_ij * expTerm; 
-                float denominator       = sqrt(denominator2); 
-                float Gpol              = (apos.w * psA[j].q) / (denominator * denominator2); 
-                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
-                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                dx                     *= dGpol_dr; 
-                dy                     *= dGpol_dr; 
-                dz                     *= dGpol_dr; 
-                af.x                   -= dx; 
-                af.y                   -= dy; 
-                af.z                   -= dz; 
-                af.w                   += dGpol_dalpha2_ij * psA[j].br;      
-            }
-            // Write results
-            int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = af;
-            cSim.pBornForce[offset]             = af.w;
-        }         
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            int j                   = y + tgx;
-            unsigned int i          = x + tgx;
-            float4 temp             = cSim.pPosq[j];
-            float temp1             = cSim.pBornRadii[j];
-            apos                    = cSim.pPosq[i];
-            float br                = cSim.pBornRadii[i];
-            sA[threadIdx.x].x       = temp.x;
-            sA[threadIdx.x].y       = temp.y;
-            sA[threadIdx.x].z       = temp.z;
-            sA[threadIdx.x].q       = temp.w;
-            sA[threadIdx.x].br      = temp1;
-            sA[threadIdx.x].fx      = af.x = 0.0f;
-            sA[threadIdx.x].fy      = af.y = 0.0f;
-            sA[threadIdx.x].fz      = af.z = 0.0f;
-            sA[threadIdx.x].fb      = af.w = 0.0f;
-            apos.w                 *= cSim.preFactor;
-            for (j = 0; j < GRID; j++)
-            {   
-                float dx                = psA[tj].x - apos.x; 
-                float dy                = psA[tj].y - apos.y; 
-                float dz                = psA[tj].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float alpha2_ij         = br * psA[tj].br; 
-                float D_ij              = r2 / (4.0f * alpha2_ij); 
-                float expTerm           = exp(-D_ij); 
-                float denominator2      = r2 + alpha2_ij * expTerm; 
-                float denominator       = sqrt(denominator2); 
-                float Gpol              = (apos.w * psA[tj].q) / (denominator * denominator2); 
-                float dGpol_dr          = Gpol * (1.0f - 0.25f * expTerm); 
-                float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                dx                     *= dGpol_dr; 
-                dy                     *= dGpol_dr; 
-                dz                     *= dGpol_dr; 
-                af.x                   -= dx; 
-                af.y                   -= dy; 
-                af.z                   -= dz; 
-                psA[tj].fx             += dx; 
-                psA[tj].fy             += dy; 
-                psA[tj].fz             += dz; 
-                af.w                   += dGpol_dalpha2_ij * psA[tj].br; 
-                psA[tj].fb             += dGpol_dalpha2_ij * br;        
-                tj                      = sNext[tj]; 
-            }
-            // Write results
-            int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = af;
-            cSim.pBornForce[offset]             = af.w;
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            af.x                                = sA[threadIdx.x].fx;
-            af.y                                = sA[threadIdx.x].fy;
-            af.z                                = sA[threadIdx.x].fz;
-            af.w                                = sA[threadIdx.x].fb;
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4a[offset]               = af;
-            cSim.pBornForce[offset]             = af.w;
-        }
-        pos -= cSim.nonbond_workBlock;     
-    }
-}
-void kCalculateObcGbsaForces1_12(gpuContext gpu)
-{
-  //  printf("kCalculateObcGbsaForces1_12\n");
-    kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    LAUNCHERROR("kCalculateObcGbsaForce1_12");
-}
--- a/platforms/cuda/src/kernels/kForces.cu
+++ b/platforms/cuda/src/kernels/kForces.cu
@@ -259,3 +259,64 @@ void kReduceForces(gpuContext gpu)
    LAUNCHERROR("kReduceForces");
 }
+__global__ void kReduceObcGbsaBornForces_kernel()
+{
+    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
+    while (pos < cSim.atoms)
+    {
+        float bornRadius = cSim.pBornRadii[pos];
+        float obcChain   = cSim.pObcChain[pos];
+        float2 obcData   = cSim.pObcData[pos];
+        float totalForce = 0.0f;
+        float* pFt = cSim.pBornForce + pos;
+        int i = cSim.nonbondOutputBuffers;
+        while (i >= 4)
+        {
+            float f1    = *pFt;
+            pFt        += cSim.stride;
+            float f2    = *pFt;
+            pFt        += cSim.stride;
+            float f3    = *pFt;
+            pFt        += cSim.stride;
+            float f4    = *pFt;
+            pFt        += cSim.stride;
+            totalForce += f1 + f2 + f3 + f4;
+            i -= 4;
+        }
+        if (i >= 2)
+        {
+            float f1    = *pFt;
+            pFt        += cSim.stride;
+            float f2    = *pFt;
+            pFt        += cSim.stride;
+            totalForce += f1 + f2;
+            i -= 2;
+        }
+        if (i > 0)
+        {
+            totalForce += *pFt;
+        }
+        float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
+        float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
+        //float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
+        float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
+        totalForce        += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
+        totalForce *= bornRadius * bornRadius * obcChain;
+        pFt = cSim.pBornForce + pos;
+        *pFt = totalForce;
+        pos += gridDim.x * blockDim.x;
+    }
+}
+void kReduceObcGbsaBornForces(gpuContext gpu)
+{
+    //printf("kReduceObcGbsaBornForces\n");
+    kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
+    LAUNCHERROR("kReduceObcGbsaBornForces");
+}