Merge pull request #2085 from peastman/charges

Support multiple NonbondedForces in one System

Merge pull request #2085 from peastman/charges
Support multiple NonbondedForces in one System
73cac8e6 · peastman · GitHub · a885a6ae · daa9de8a · 73cac8e6
Unverified Commit 73cac8e6 authored Jun 08, 2018 by peastman Committed by GitHub Jun 08, 2018
20 changed files
--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -268,7 +268,7 @@ public:
 private:
    class PmeIO;
    CpuPlatform::PlatformData& data;
-    int numParticles, num14;
+    int numParticles, num14, posqIndex;
    int **bonded14IndexArray;
    double **bonded14ParamArray;
    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
@@ -277,6 +277,7 @@ private:
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
    std::vector<float> C6params;
+    std::vector<float> charges;
    NonbondedMethod nonbondedMethod;
    CpuNonbondedForce* nonbonded;
    Kernel optimizedPme, optimizedDispersionPme;
@@ -363,7 +364,9 @@ public:
    void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
 private:
    CpuPlatform::PlatformData& data;
+    int posqIndex;
    std::vector<std::pair<float, float> > particleParams;
+    std::vector<float> charges;
    CpuGBSAOBCForce obc;
 };

--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -92,6 +92,7 @@ public:
    PlatformData(int numParticles, int numThreads, bool deterministicForces);
    ~PlatformData();
    void requestNeighborList(double cutoffDistance, double padding, bool useExclusions, const std::vector<std::set<int> >& exclusionList);
+    int requestPosqIndex();
    AlignedArray<float> posq;
    std::vector<AlignedArray<float> > threadForce;
    ThreadPool threads;
@@ -101,6 +102,7 @@ public:
    CpuNeighborList* neighborList;
    double cutoff, paddedCutoff;
    bool anyExclusions, deterministicForces;
+    int currentPosqIndex, nextPosqIndex;
    std::vector<std::set<int> > exclusions;
 };

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -138,6 +138,19 @@ static double computeShiftedKineticEnergy(ContextImpl& context, vector<double>&
    return 0.5*energy;
 }
+/**
+ * Copy particle charges into the fourth element of the posq array.
+ */
+static void copyChargesToPosq(ContextImpl& context, const vector<float>& charges, int index) {
+    CpuPlatform::PlatformData& data = CpuPlatform::getPlatformData(context);
+    if (index == data.currentPosqIndex)
+        return;
+    data.currentPosqIndex = index;
+    AlignedArray<float>& posq = data.posq;
+    for (int i = 0; i < charges.size(); i++)
+        posq[4*i+3] = charges[i];
+}
 CpuCalcForcesAndEnergyKernel::CpuCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data, ContextImpl& context) :
        CalcForcesAndEnergyKernel(name, platform), data(data) {
    // Create a Reference platform version of this kernel.
@@ -530,6 +543,7 @@ CpuCalcNonbondedForceKernel::~CpuCalcNonbondedForceKernel() {
 }
 void CpuCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
+    posqIndex = data.requestPosqIndex();
    // Identify which exceptions are 1-4 interactions.
@@ -556,12 +570,13 @@ void CpuCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
    for (int i = 0; i < num14; i++)
        bonded14ParamArray[i] = new double[3];
    particleParams.resize(numParticles);
+    charges.resize(numParticles);
    C6params.resize(numParticles);
    double sumSquaredCharges = 0.0;
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
-        data.posq[4*i+3] = (float) charge;
+        charges[i] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
        C6params[i] = 8.0*pow(particleParams[i].first, 3.0) * particleParams[i].second;
        sumSquaredCharges += charge*charge;
@@ -660,6 +675,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
+    copyChargesToPosq(context, charges, posqIndex);
    AlignedArray<float>& posq = data.posq;
    vector<Vec3>& posData = extractPositions(context);
    vector<Vec3>& forceData = extractForces(context);
@@ -726,11 +742,12 @@ void CpuCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context,
    // Record the values.
+    posqIndex = data.requestPosqIndex();
    double sumSquaredCharges = 0.0;
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, depth;
        force.getParticleParameters(i, charge, radius, depth);
-        data.posq[4*i+3] = (float) charge;
+        charges[i] = (float) charge;
        particleParams[i] = make_pair((float) (0.5*radius), (float) (2.0*sqrt(depth)));
        sumSquaredCharges += charge*charge;
    }
@@ -964,12 +981,14 @@ CpuCalcGBSAOBCForceKernel::~CpuCalcGBSAOBCForceKernel() {
 }
 void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
+    posqIndex = data.requestPosqIndex();
    int numParticles = system.getNumParticles();
    particleParams.resize(numParticles);
+    charges.resize(numParticles);
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, scalingFactor;
        force.getParticleParameters(i, charge, radius, scalingFactor);
-        data.posq[4*i+3] = (float) charge;
+        charges[i] = (float) charge;
        radius -= 0.009;
        particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
    }
@@ -983,6 +1002,7 @@ void CpuCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCFo
 }
 double CpuCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
+    copyChargesToPosq(context, charges, posqIndex);
    if (data.isPeriodic) {
        Vec3& boxSize = extractBoxSize(context);
        float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
@@ -1000,10 +1020,11 @@ void CpuCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, co
    // Record the values.
+    posqIndex = data.requestPosqIndex();
    for (int i = 0; i < numParticles; ++i) {
        double charge, radius, scalingFactor;
        force.getParticleParameters(i, charge, radius, scalingFactor);
-        data.posq[4*i+3] = (float) charge;
+        charges[i] = (float) charge;
        radius -= 0.009;
        particleParams[i] = make_pair((float) radius, (float) (scalingFactor*radius));
    }

--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2016 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -147,7 +147,7 @@ const CpuPlatform::PlatformData& CpuPlatform::getPlatformData(const ContextImpl&
 }
 CpuPlatform::PlatformData::PlatformData(int numParticles, int numThreads, bool deterministicForces) : posq(4*numParticles), threads(numThreads),
-        deterministicForces(deterministicForces), neighborList(NULL), cutoff(0.0), paddedCutoff(0.0), anyExclusions(false) {
+        deterministicForces(deterministicForces), neighborList(NULL), cutoff(0.0), paddedCutoff(0.0), anyExclusions(false), currentPosqIndex(-1), nextPosqIndex(0) {
    numThreads = threads.getNumThreads();
    threadForce.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
@@ -184,3 +184,7 @@ void CpuPlatform::PlatformData::requestNeighborList(double cutoffDistance, doubl
    else if (!anyExclusions)
        exclusions = exclusionList;
 }
+int CpuPlatform::PlatformData::requestPosqIndex() {
+    return nextPosqIndex++;
+}
\ No newline at end of file
--- a/platforms/cuda/include/CudaContext.h
+++ b/platforms/cuda/include/CudaContext.h
@@ -503,6 +503,11 @@ public:
     * Set the particle charges.  These are packed into the fourth element of the posq array.
     */
    void setCharges(const std::vector<double>& charges);
+    /**
+     * Request to use the fourth element of the posq array for storing charges.  Since only one force can
+     * do that, this returns true the first time it is called, and false on all subsequent calls.
+     */
+    bool requestPosqCharges();
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -626,7 +631,7 @@ private:
    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel, isNvccAvailable, forcesValid;
+    bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, atomsWereReordered, boxIsTriclinic, hasCompilerKernel, isNvccAvailable, forcesValid, hasAssignedPosqCharges;
    bool isLinkedContext;
    std::string compiler, tempDir, cacheDir, gpuArchitecture;
    float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;

--- a/platforms/cuda/include/CudaKernels.h
+++ b/platforms/cuda/include/CudaKernels.h
@@ -675,6 +675,7 @@ private:
    CudaContext& cu;
    ForceInfo* info;
    bool hasInitializedFFT;
+    CudaArray charges;
    CudaArray sigmaEpsilon;
    CudaArray exceptionParams;
    CudaArray cosSinSums;
@@ -719,7 +720,7 @@ private:
    int interpolateForceThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME;
+    bool hasCoulomb, hasLJ, usePmeStream, useCudaFFT, doLJPME, usePosqCharges;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };
@@ -815,6 +816,7 @@ private:
    int maxTiles;
    CudaContext& cu;
    ForceInfo* info;
+    CudaArray charges;
    CudaArray params;
    CudaArray bornSum;
    CudaArray bornRadii;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -107,8 +107,8 @@ static int executeInWindows(const string &command) {
 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
-        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
+        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasAssignedPosqCharges(false),
-        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
+        hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    // Determine what compiler to use.
    this->compiler = "\""+compiler+"\"";
@@ -301,6 +301,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        compilationDefines["make_mixed4"] = "make_float4";
    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
+    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
+    atomIndex.resize(paddedNumAtoms);
+    for (int i = 0; i < paddedNumAtoms; ++i)
+        atomIndex[i] = i;
+    atomIndexDevice.upload(atomIndex);
    // Create utility kernels that are used in multiple places.
@@ -475,11 +480,6 @@ void CudaContext::initialize() {
            energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
        addAutoclearBuffer(energyParamDerivBuffer);
    }
-    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
-    atomIndex.resize(paddedNumAtoms);
-    for (int i = 0; i < paddedNumAtoms; ++i)
-        atomIndex[i] = i;
-    atomIndexDevice.upload(atomIndex);
    findMoleculeGroups();
    nonbonded->initialize(system);
 }
@@ -887,14 +887,14 @@ void CudaContext::setCharges(const vector<double>& charges) {
    if (!chargeBuffer.isInitialized())
        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
    if (getUseDoublePrecision()) {
-        double* c = (double*) getPinnedBuffer();
+        vector<double> c(numAtoms);
-        for (int i = 0; i < charges.size(); i++)
+        for (int i = 0; i < numAtoms; i++)
            c[i] = charges[i];
        chargeBuffer.upload(c);
    }
    else {
-        float* c = (float*) getPinnedBuffer();
+        vector<float> c(numAtoms);
-        for (int i = 0; i < charges.size(); i++)
+        for (int i = 0; i < numAtoms; i++)
            c[i] = (float) charges[i];
        chargeBuffer.upload(c);
    }
@@ -902,6 +902,12 @@ void CudaContext::setCharges(const vector<double>& charges) {
    executeKernel(setChargesKernel, args, numAtoms);
 }
+bool CudaContext::requestPosqCharges() {
+    bool allow = !hasAssignedPosqCharges;
+    hasAssignedPosqCharges = true;
+    return allow;
+}
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
--- a/platforms/cuda/src/kernels/coulombLennardJones.cu
+++ b/platforms/cuda/src/kernels/coulombLennardJones.cu
@@ -4,7 +4,11 @@
    unsigned int includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
    const real alphaR = EWALD_ALPHA*r;
    const real expAlphaRSqr = EXP(-alphaR*alphaR);
-    const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
+#if HAS_COULOMB
+    const real prefactor = 138.935456f*CHARGE1*CHARGE2*invR;
+#else
+    const real prefactor = 0.0f;
+#endif
 #ifdef USE_DOUBLE_PRECISION
    const real erfcAlphaR = erfc(alphaR);
@@ -30,7 +34,7 @@
        const real dar6 = dar4*dar2;
        const real invR2 = invR*invR;
        const real expDar2 = EXP(-dar2);
-        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
        const real coef = invR2*invR2*invR2*c6;
        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
@@ -40,6 +44,7 @@
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+#if HAS_COULOMB
        if (1-erfcAlphaR > 1e-6) {
            real erfAlphaR = ERF(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
@@ -47,8 +52,9 @@
        }
        else {
            includeInteraction = false;
-            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
+            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*CHARGE1*CHARGE2;
        }
+#endif
 #if HAS_LENNARD_JONES
        #if DO_LJPME
            // The multiplicative grid term
@@ -59,11 +65,11 @@
    }
    else {
 #if HAS_LENNARD_JONES
-        real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+        real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
@@ -108,11 +114,11 @@
 #endif
    real tempForce = 0.0f;
  #if HAS_LENNARD_JONES
-    real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
    real sig2 = invR*sig;
    sig2 *= sig2;
    real sig6 = sig2*sig2*sig2;
-    real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+    real epssig6 = sig6*(SIGMA_EPSILON1.y*SIGMA_EPSILON2.y);
    tempForce = epssig6*(12.0f*sig6 - 6.0f);
    real ljEnergy = includeInteraction ? epssig6*(sig6 - 1) : 0;
    #if USE_LJ_SWITCH
@@ -128,11 +134,11 @@
  #endif
 #if HAS_COULOMB
  #ifdef USE_CUTOFF
-    const real prefactor = 138.935456f*posq1.w*posq2.w;
+    const real prefactor = 138.935456f*CHARGE1*CHARGE2;
    tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
    tempEnergy += includeInteraction ? prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C) : 0;
  #else
-    const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
+    const real prefactor = 138.935456f*CHARGE1*CHARGE2*invR;
    tempForce += prefactor;
    tempEnergy += includeInteraction ? prefactor : 0;
  #endif

--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
@@ -66,7 +66,7 @@ typedef struct {
 /**
 * Compute the Born sum.
 */
-extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params,
+extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const real* __restrict__ charge, const float2* __restrict__ global_params,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
@@ -92,6 +92,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
        real bornSum = 0;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
+        real charge1 = charge[atom1];
        float2 params1 = global_params[atom1];
        if (x == y) {
            // This tile is on the diagonal.
@@ -99,7 +100,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
            localData[threadIdx.x].x = posq1.x;
            localData[threadIdx.x].y = posq1.y;
            localData[threadIdx.x].z = posq1.z;
-            localData[threadIdx.x].q = posq1.w;
+            localData[threadIdx.x].q = charge1;
            localData[threadIdx.x].radius = params1.x;
            localData[threadIdx.x].scaledRadius = params1.y;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
@@ -138,7 +139,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
            localData[threadIdx.x].x = tempPosq.x;
            localData[threadIdx.x].y = tempPosq.y;
            localData[threadIdx.x].z = tempPosq.z;
-            localData[threadIdx.x].q = tempPosq.w;
+            localData[threadIdx.x].q = charge[j];
            float2 tempParams = global_params[j];
            localData[threadIdx.x].radius = tempParams.x;
            localData[threadIdx.x].scaledRadius = tempParams.y;
@@ -262,6 +263,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
+            real charge1 = charge[atom1];
            float2 params1 = global_params[atom1];
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
@@ -274,7 +276,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
                localData[threadIdx.x].x = tempPosq.x;
                localData[threadIdx.x].y = tempPosq.y;
                localData[threadIdx.x].z = tempPosq.z;
-                localData[threadIdx.x].q = tempPosq.w;
+                localData[threadIdx.x].q = charge[j];
                float2 tempParams = global_params[j];
                localData[threadIdx.x].radius = tempParams.x;
                localData[threadIdx.x].scaledRadius = tempParams.y;
@@ -400,7 +402,7 @@ typedef struct {
 */
 extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
-        mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii, bool needEnergy,
+        mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ charge, const real* __restrict__ global_bornRadii, bool needEnergy,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
@@ -427,6 +429,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
        real4 force = make_real4(0);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
+        real charge1 = charge[atom1];
        real bornRadius1 = global_bornRadii[atom1];
        if (x == y) {
            // This tile is on the diagonal.
@@ -434,12 +437,13 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
            localData[threadIdx.x].x = posq1.x;
            localData[threadIdx.x].y = posq1.y;
            localData[threadIdx.x].z = posq1.z;
-            localData[threadIdx.x].q = posq1.w;
+            localData[threadIdx.x].q = charge1;
            localData[threadIdx.x].bornRadius = bornRadius1;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-                    real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
+                    real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
-                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real charge2 = localData[tbx+j].q;
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -455,7 +459,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -485,7 +489,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
            localData[threadIdx.x].x = tempPosq.x;
            localData[threadIdx.x].y = tempPosq.y;
            localData[threadIdx.x].z = tempPosq.z;
-            localData[threadIdx.x].q = tempPosq.w;
+            localData[threadIdx.x].q = charge[j];
            localData[threadIdx.x].bornRadius = global_bornRadii[j];
            localData[threadIdx.x].fx = 0.0f;
            localData[threadIdx.x].fy = 0.0f;
@@ -494,8 +498,9 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                    real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                    real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                    real charge2 = localData[tbx+tj].q;
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -511,7 +516,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -617,6 +622,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
+            real charge1 = charge[atom1];
            real bornRadius1 = global_bornRadii[atom1];
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
@@ -629,7 +635,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                localData[threadIdx.x].x = tempPosq.x;
                localData[threadIdx.x].y = tempPosq.y;
                localData[threadIdx.x].z = tempPosq.z;
-                localData[threadIdx.x].q = tempPosq.w;
+                localData[threadIdx.x].q = charge[j];
                localData[threadIdx.x].bornRadius = global_bornRadii[j];
                localData[threadIdx.x].fx = 0.0f;
                localData[threadIdx.x].fy = 0.0f;
@@ -648,8 +654,9 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                        real charge2 = localData[tbx+tj].q;
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        if (r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
@@ -660,7 +667,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -693,8 +700,9 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+                        real charge2 = localData[tbx+tj].q;
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -710,7 +718,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);

--- a/platforms/cuda/src/kernels/gbsaObc2.cu
+++ b/platforms/cuda/src/kernels/gbsaObc2.cu
 {
    real invRSquaredOver4 = 0.25f*invR*invR;
-    real rScaledRadiusJ = r+obcParams2.y;
+    real rScaledRadiusJ = r+OBC_PARAMS2.y;
-    real rScaledRadiusI = r+obcParams1.y;
+    real rScaledRadiusI = r+OBC_PARAMS1.y;
-    real l_ijJ = RECIP(max(obcParams1.x, fabs(r-obcParams2.y)));
+    real l_ijJ = RECIP(max(OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
-    real l_ijI = RECIP(max(obcParams2.x, fabs(r-obcParams1.y)));
+    real l_ijI = RECIP(max(OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
    real u_ijJ = RECIP(rScaledRadiusJ);
    real u_ijI = RECIP(rScaledRadiusI);
    real l_ij2J = l_ijJ*l_ijJ;
@@ -14,10 +14,10 @@
    real t1I = LOG(u_ijI*RECIP(l_ijI));
    real t2J = (l_ij2J-u_ij2J);
    real t2I = (l_ij2I-u_ij2I);
-    real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
+    real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
-    real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
+    real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
-    real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1/0x100000000 : 0);
+    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
-    tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2/0x100000000 : 0);
+    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
 #ifdef USE_CUTOFF
    unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
 #else

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -3,8 +3,8 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
            real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // Compute the index of the grid point each atom is associated with.
-    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
-        real4 pos = posq[i];
+        real4 pos = posq[atom];
        APPLY_PERIODIC_TO_POS(pos)
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
@@ -15,7 +15,7 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                   ((int) t.y) % GRID_SIZE_Y,
                                   ((int) t.z) % GRID_SIZE_Z);
-        pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
+        pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
    }
 }
@@ -24,6 +24,8 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
 #ifdef USE_LJPME
        , const float2* __restrict__ sigmaEpsilon
+#else
+        , const real* __restrict__ charges
 #endif
        ) {
    real3 data[PME_ORDER];
@@ -39,7 +41,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        const float2 sigEps = sigmaEpsilon[atom];
        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
 #else
-        const real charge = pos.w;
+        const real charge = CHARGE;
 #endif
        if (charge == 0)
            continue;
@@ -253,6 +255,8 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, const int2* __restrict__ pmeAtomGridIndex
 #ifdef USE_LJPME
        , const float2* __restrict__ sigmaEpsilon
+#else
+        , const real* __restrict__ charges
 #endif
        ) {
    real3 data[PME_ORDER];
@@ -330,7 +334,7 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        const float2 sigEps = sigmaEpsilon[atom];
        real q = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
 #else
-        real q = pos.w*EPSILON_FACTOR;
+        real q = CHARGE*EPSILON_FACTOR;
 #endif
        real forceX = -q*(force.x*GRID_SIZE_X*recipBoxVecX.x);
        real forceY = -q*(force.x*GRID_SIZE_X*recipBoxVecY.x+force.y*GRID_SIZE_Y*recipBoxVecY.y);

--- a/platforms/opencl/include/OpenCLContext.h
+++ b/platforms/opencl/include/OpenCLContext.h
@@ -618,6 +618,11 @@ public:
     * Set the particle charges.  These are packed into the fourth element of the posq array.
     */
    void setCharges(const std::vector<double>& charges);
+    /**
+     * Request to use the fourth element of the posq array for storing charges.  Since only one force can
+     * do that, this returns true the first time it is called, and false on all subsequent calls.
+     */
+    bool requestPosqCharges();
    /**
     * Get the thread used by this context for executing parallel computations.
     */
@@ -738,7 +743,7 @@ private:
    int numThreadBlocks;
    int numForceBuffers;
    int simdWidth;
-    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic, forcesValid;
+    bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic, forcesValid, hasAssignedPosqCharges;
    mm_float4 periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ;
    mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble, periodicBoxVecXDouble, periodicBoxVecYDouble, periodicBoxVecZDouble;
    std::string defaultOptimizationOptions;

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -652,6 +652,7 @@ private:
    OpenCLContext& cl;
    ForceInfo* info;
    bool hasInitializedKernel;
+    OpenCLArray charges;
    OpenCLArray sigmaEpsilon;
    OpenCLArray exceptionParams;
    OpenCLArray cosSinSums;
@@ -698,7 +699,7 @@ private:
    double ewaldSelfEnergy, dispersionCoefficient, alpha, dispersionAlpha;
    int gridSizeX, gridSizeY, gridSizeZ;
    int dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ;
-    bool hasCoulomb, hasLJ, usePmeQueue, doLJPME;
+    bool hasCoulomb, hasLJ, usePmeQueue, doLJPME, usePosqCharges;
    NonbondedMethod nonbondedMethod;
    static const int PmeOrder = 5;
 };
@@ -795,6 +796,7 @@ private:
    OpenCLContext& cl;
    ForceInfo* info;
    OpenCLArray params;
+    OpenCLArray charges;
    OpenCLArray bornSum;
    OpenCLArray longBornSum;
    OpenCLArray bornRadii;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -68,7 +68,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
 }
 OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) :
-        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false),
+        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), hasAssignedPosqCharges(false),
        integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
    if (precision == "single") {
        useDoublePrecision = false;
@@ -295,6 +295,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            compilationDefines["convert_mixed4"] = "convert_float4";
        }
        posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
+        atomIndexDevice.initialize<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
+        atomIndex.resize(paddedNumAtoms);
+        for (int i = 0; i < paddedNumAtoms; ++i)
+            atomIndex[i] = i;
+        atomIndexDevice.upload(atomIndex);
    }
    catch (cl::Error err) {
        std::stringstream str;
@@ -494,11 +499,6 @@ void OpenCLContext::initialize() {
            ((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (cl_float) (1.0/mass));
    }
    velm.upload(pinnedMemory);
-    atomIndexDevice.initialize<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
-    atomIndex.resize(paddedNumAtoms);
-    for (int i = 0; i < paddedNumAtoms; ++i)
-        atomIndex[i] = i;
-    atomIndexDevice.upload(atomIndex);
    findMoleculeGroups();
    nonbonded->initialize(system);
 }
@@ -770,14 +770,14 @@ void OpenCLContext::setCharges(const vector<double>& charges) {
    if (!chargeBuffer.isInitialized())
        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
    if (getUseDoublePrecision()) {
-        double* c = (double*) getPinnedBuffer();
+        vector<double> c(numAtoms);
-        for (int i = 0; i < charges.size(); i++)
+        for (int i = 0; i < numAtoms; i++)
            c[i] = charges[i];
        chargeBuffer.upload(c);
    }
    else {
-        float* c = (float*) getPinnedBuffer();
+        vector<float> c(numAtoms);
-        for (int i = 0; i < charges.size(); i++)
+        for (int i = 0; i < numAtoms; i++)
            c[i] = (float) charges[i];
        chargeBuffer.upload(c);
    }
@@ -788,6 +788,12 @@ void OpenCLContext::setCharges(const vector<double>& charges) {
    executeKernel(setChargesKernel, numAtoms);
 }
+bool OpenCLContext::requestPosqCharges() {
+    bool allow = !hasAssignedPosqCharges;
+    hasAssignedPosqCharges = true;
+    return allow;
+}
 /**
 * This class ensures that atom reordering doesn't break virtual sites.
 */

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/kernels/coulombLennardJones.cl
+++ b/platforms/opencl/src/kernels/coulombLennardJones.cl
@@ -9,7 +9,11 @@
    includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
    const real alphaR = EWALD_ALPHA*r;
    const real expAlphaRSqr = EXP(-alphaR*alphaR);
-    const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
+#if HAS_COULOMB
+    const real prefactor = 138.935456f*CHARGE1*CHARGE2*invR;
+#else
+    const real prefactor = 0.0f;
+#endif
 #ifdef USE_DOUBLE_PRECISION
    const real erfcAlphaR = erfc(alphaR);
@@ -35,7 +39,7 @@
        const real dar6 = dar4*dar2;
        const real invR2 = invR*invR;
        const real expDar2 = EXP(-dar2);
-        const float2 sigExpProd = sigmaEpsilon1*sigmaEpsilon2;
+        const float2 sigExpProd = SIGMA_EPSILON1*SIGMA_EPSILON2;
        const real c6 = 64*sigExpProd.x*sigExpProd.x*sigExpProd.x*sigExpProd.y;
        const real coef = invR2*invR2*invR2*c6;
        const real eprefac = 1.0f + dar2 + 0.5f*dar4;
@@ -45,6 +49,7 @@
    if (needCorrection) {
        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+#if HAS_COULOMB
        if (1-erfcAlphaR > 1e-6) {
            real erfAlphaR = erf(alphaR); // Our erfc approximation is not accurate enough when r is very small, which happens with Drude particles.
            tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
@@ -52,8 +57,9 @@
        }
        else {
            includeInteraction = false;
-            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
+            tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*CHARGE1*CHARGE2;
        }
+#endif
 #if HAS_LENNARD_JONES
        #if DO_LJPME
            // The multiplicative grid term
@@ -64,11 +70,11 @@
    }
    else {
 #if HAS_LENNARD_JONES
-        real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+        real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
        real sig2 = invR*sig;
        sig2 *= sig2;
        real sig6 = sig2*sig2*sig2;
-        real eps = sigmaEpsilon1.y*sigmaEpsilon2.y;
+        real eps = SIGMA_EPSILON1.y*SIGMA_EPSILON2.y;
        real epssig6 = sig6*eps;
        tempForce = epssig6*(12.0f*sig6 - 6.0f);
        real ljEnergy = epssig6*(sig6 - 1.0f);
@@ -113,11 +119,11 @@
 #endif
    real tempForce = 0;
  #if HAS_LENNARD_JONES
-    real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
+    real sig = SIGMA_EPSILON1.x + SIGMA_EPSILON2.x;
    real sig2 = invR*sig;
    sig2 *= sig2;
    real sig6 = sig2*sig2*sig2;
-    real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
+    real epssig6 = sig6*(SIGMA_EPSILON1.y*SIGMA_EPSILON2.y);
    tempForce = epssig6*(12.0f*sig6 - 6.0f);
    real ljEnergy = epssig6*(sig6-1);
    #if USE_LJ_SWITCH
@@ -134,11 +140,11 @@
  #endif
 #if HAS_COULOMB
  #ifdef USE_CUTOFF
-    const real prefactor = 138.935456f*posq1.w*posq2.w;
+    const real prefactor = 138.935456f*CHARGE1*CHARGE2;
    tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
    tempEnergy += select((real) 0, prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C), includeInteraction);
  #else
-    const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
+    const real prefactor = 138.935456f*CHARGE1*CHARGE2*invR;
    tempForce += prefactor;
    tempEnergy += select((real) 0, prefactor, includeInteraction);
  #endif

--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
@@ -19,7 +19,7 @@ __kernel void computeBornSum(
 #else
        __global real* restrict global_bornSum,
 #endif
-        __global const real4* restrict posq, __global const float2* restrict global_params,
+        __global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -45,6 +45,7 @@ __kernel void computeBornSum(
        real bornSum = 0.0f;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
+        real charge1 = charge[atom1];
        float2 params1 = global_params[atom1];
        if (x == y) {
            // This tile is on the diagonal.
@@ -52,7 +53,7 @@ __kernel void computeBornSum(
            localData[get_local_id(0)].x = posq1.x;
            localData[get_local_id(0)].y = posq1.y;
            localData[get_local_id(0)].z = posq1.z;
-            localData[get_local_id(0)].q = posq1.w;
+            localData[get_local_id(0)].q = charge1;
            localData[get_local_id(0)].radius = params1.x;
            localData[get_local_id(0)].scaledRadius = params1.y;
            SYNC_WARPS;
@@ -93,7 +94,7 @@ __kernel void computeBornSum(
            localData[get_local_id(0)].x = tempPosq.x;
            localData[get_local_id(0)].y = tempPosq.y;
            localData[get_local_id(0)].z = tempPosq.z;
-            localData[get_local_id(0)].q = tempPosq.w;
+            localData[get_local_id(0)].q = charge[j];
            float2 tempParams = global_params[j];
            localData[get_local_id(0)].radius = tempParams.x;
            localData[get_local_id(0)].scaledRadius = tempParams.y;
@@ -230,6 +231,7 @@ __kernel void computeBornSum(
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
+            real charge1 = charge[atom1];
            float2 params1 = global_params[atom1];
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
@@ -242,7 +244,7 @@ __kernel void computeBornSum(
                localData[get_local_id(0)].x = tempPosq.x;
                localData[get_local_id(0)].y = tempPosq.y;
                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = tempPosq.w;
+                localData[get_local_id(0)].q = charge[j];
                float2 tempParams = global_params[j];
                localData[get_local_id(0)].radius = tempParams.x;
                localData[get_local_id(0)].scaledRadius = tempParams.y;
@@ -385,7 +387,8 @@ __kernel void computeGBSAForce1(
 #else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii, int needEnergy,
+        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge,
+        __global const real* restrict global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -412,6 +415,7 @@ __kernel void computeGBSAForce1(
        real4 force = 0.0f;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
+        real charge1 = charge[atom1];
        real bornRadius1 = global_bornRadii[atom1];
        if (x == y) {
            // This tile is on the diagonal.
@@ -420,13 +424,14 @@ __kernel void computeGBSAForce1(
            localData[localAtomIndex].x = posq1.x;
            localData[localAtomIndex].y = posq1.y;
            localData[localAtomIndex].z = posq1.z;
-            localData[localAtomIndex].q = posq1.w;
+            localData[localAtomIndex].q = charge1;
            localData[get_local_id(0)].bornRadius = bornRadius1;
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-                    real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
+                    real3 pos2 = (real3) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real charge2 = localData[tbx+j].q;
+                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -442,7 +447,7 @@ __kernel void computeGBSAForce1(
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -471,7 +476,7 @@ __kernel void computeGBSAForce1(
            localData[get_local_id(0)].x = tempPosq.x;
            localData[get_local_id(0)].y = tempPosq.y;
            localData[get_local_id(0)].z = tempPosq.z;
-            localData[get_local_id(0)].q = tempPosq.w;
+            localData[get_local_id(0)].q = charge[j];
            localData[get_local_id(0)].bornRadius = global_bornRadii[j];
            localData[get_local_id(0)].fx = 0.0f;
            localData[get_local_id(0)].fy = 0.0f;
@@ -481,8 +486,9 @@ __kernel void computeGBSAForce1(
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                    real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                    real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real charge2 = localData[tbx+tj].q;
+                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -498,7 +504,7 @@ __kernel void computeGBSAForce1(
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -617,6 +623,7 @@ __kernel void computeGBSAForce1(
            // Load atom data for this tile.
            real4 posq1 = posq[atom1];
+            real charge1 = charge[atom1];
            real bornRadius1 = global_bornRadii[atom1];
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
@@ -629,7 +636,7 @@ __kernel void computeGBSAForce1(
                localData[get_local_id(0)].x = tempPosq.x;
                localData[get_local_id(0)].y = tempPosq.y;
                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = tempPosq.w;
+                localData[get_local_id(0)].q = charge[j];
                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
                localData[get_local_id(0)].fx = 0.0f;
                localData[get_local_id(0)].fy = 0.0f;
@@ -650,8 +657,9 @@ __kernel void computeGBSAForce1(
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real charge2 = localData[tbx+tj].q;
+                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        if (r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
@@ -662,7 +670,7 @@ __kernel void computeGBSAForce1(
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -694,8 +702,9 @@ __kernel void computeGBSAForce1(
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
+                        real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real charge2 = localData[tbx+tj].q;
+                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -711,7 +720,7 @@ __kernel void computeGBSAForce1(
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);

--- a/platforms/opencl/src/kernels/gbsaObc2.cl
+++ b/platforms/opencl/src/kernels/gbsaObc2.cl
 {
    real invRSquaredOver4 = 0.25f*invR*invR;
-    real rScaledRadiusJ = r+obcParams2.y;
+    real rScaledRadiusJ = r+OBC_PARAMS2.y;
-    real rScaledRadiusI = r+obcParams1.y;
+    real rScaledRadiusI = r+OBC_PARAMS1.y;
-    real l_ijJ = RECIP(max((real) obcParams1.x, fabs(r-obcParams2.y)));
+    real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
-    real l_ijI = RECIP(max((real) obcParams2.x, fabs(r-obcParams1.y)));
+    real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
    real u_ijJ = RECIP(rScaledRadiusJ);
    real u_ijI = RECIP(rScaledRadiusI);
    real l_ij2J = l_ijJ*l_ijJ;
@@ -14,10 +14,10 @@
    real t1I = LOG(u_ijI*RECIP(l_ijI));
    real t2J = (l_ij2J-u_ij2J);
    real t2I = (l_ij2I-u_ij2I);
-    real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
+    real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
-    real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
+    real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
-    real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1 : (real) 0);
+    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
-    tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2 : (real) 0);
+    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
 #ifdef USE_CUTOFF
    bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
 #else

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -18,7 +18,7 @@ __kernel void computeBornSum(
 #else
        __global real* restrict global_bornSum,
 #endif
-        __global const real4* restrict posq, __global const float2* restrict global_params,
+        __global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -46,7 +46,7 @@ __kernel void computeBornSum(
            localData[localAtomIndex].x = tempPosq.x;
            localData[localAtomIndex].y = tempPosq.y;
            localData[localAtomIndex].z = tempPosq.z;
-            localData[localAtomIndex].q = tempPosq.w;
+            localData[localAtomIndex].q = charge[j];
            float2 tempParams = global_params[j];
            localData[localAtomIndex].radius = tempParams.x;
            localData[localAtomIndex].scaledRadius = tempParams.y;
@@ -60,8 +60,9 @@ __kernel void computeBornSum(
                real4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real charge2 = localData[j].q;
+                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -109,8 +110,9 @@ __kernel void computeBornSum(
                real4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real charge2 = localData[j].q;
+                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -238,7 +240,7 @@ __kernel void computeBornSum(
                    localData[localAtomIndex].x = tempPosq.x;
                    localData[localAtomIndex].y = tempPosq.y;
                    localData[localAtomIndex].z = tempPosq.z;
-                    localData[localAtomIndex].q = tempPosq.w;
+                    localData[localAtomIndex].q = charge[j];
                    float2 tempParams = global_params[j];
                    localData[localAtomIndex].radius = tempParams.x;
                    localData[localAtomIndex].scaledRadius = tempParams.y;
@@ -261,8 +263,9 @@ __kernel void computeBornSum(
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real charge2 = localData[j].q;
+                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
@@ -316,8 +319,9 @@ __kernel void computeBornSum(
                    real4 posq1 = posq[atom1];
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real charge2 = localData[j].q;
+                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -407,7 +411,8 @@ __kernel void computeGBSAForce1(
 #else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii, int needEnergy,
+        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge,
+        __global const real* restrict global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -436,7 +441,7 @@ __kernel void computeGBSAForce1(
            localData[localAtomIndex].x = tempPosq.x;
            localData[localAtomIndex].y = tempPosq.y;
            localData[localAtomIndex].z = tempPosq.z;
-            localData[localAtomIndex].q = tempPosq.w;
+            localData[localAtomIndex].q = charge[j];
            localData[localAtomIndex].bornRadius = global_bornRadii[j];
        }
        if (x == y) {
@@ -446,10 +451,12 @@ __kernel void computeGBSAForce1(
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                real4 posq1 = posq[atom1];
+                real charge1 = charge[atom1];
                real bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real charge2 = localData[j].q;
+                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -467,7 +474,7 @@ __kernel void computeGBSAForce1(
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -510,10 +517,12 @@ __kernel void computeGBSAForce1(
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                real4 posq1 = posq[atom1];
+                real charge1 = charge[atom1];
                real bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real charge2 = localData[j].q;
+                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -531,7 +540,7 @@ __kernel void computeGBSAForce1(
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
-                        real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                        real scaledChargeProduct = PREFACTOR*charge1*charge2;
                        real tempEnergy = scaledChargeProduct*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -651,7 +660,7 @@ __kernel void computeGBSAForce1(
                    localData[localAtomIndex].x = tempPosq.x;
                    localData[localAtomIndex].y = tempPosq.y;
                    localData[localAtomIndex].z = tempPosq.z;
-                    localData[localAtomIndex].q = tempPosq.w;
+                    localData[localAtomIndex].q = charge[j];
                    localData[localAtomIndex].bornRadius = global_bornRadii[j];
                    localData[localAtomIndex].fx = 0.0f;
                    localData[localAtomIndex].fy = 0.0f;
@@ -672,11 +681,13 @@ __kernel void computeGBSAForce1(
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    real4 posq1 = posq[atom1];
+                    real charge1 = charge[atom1];
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real charge2 = localData[j].q;
+                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
@@ -688,7 +699,7 @@ __kernel void computeGBSAForce1(
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
@@ -730,10 +741,12 @@ __kernel void computeGBSAForce1(
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    real4 posq1 = posq[atom1];
+                    real charge1 = charge[atom1];
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
+                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real charge2 = localData[j].q;
+                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -752,7 +765,7 @@ __kernel void computeGBSAForce1(
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
-                            real scaledChargeProduct = PREFACTOR*posq1.w*posq2.w;
+                            real scaledChargeProduct = PREFACTOR*charge1*charge2;
                            real tempEnergy = scaledChargeProduct*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);