Merge pull request #1 from peastman/ljpme

Cleanup to LJ PME code

Merge pull request #1 from peastman/ljpme
Cleanup to LJ PME code
3b6925ae · Andy Simmonett · GitHub · 5a8a8aa9 · f7a102fb · 3b6925ae
Commit 3b6925ae authored Jan 26, 2017 by Andy Simmonett Committed by GitHub Jan 26, 2017
20 changed files
--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -124,8 +124,9 @@ void OpenCLSort::sort(OpenCLArray& data) {
        computeRangeKernel.setArg<cl_uint>(1, data.getSize());
        computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
        computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
-        computeRangeKernel.setArg<cl_int>(4, numBuckets);
-        computeRangeKernel.setArg<cl::Buffer>(5, bucketOffset->getDeviceBuffer());
+        computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
+        computeRangeKernel.setArg<cl_int>(5, numBuckets);
+        computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset->getDeviceBuffer());
        context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);

        // Assign array elements to buckets.

--- a/platforms/opencl/src/kernels/customGBEnergyN2.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2.cl
@@ -18,7 +18,7 @@ __kernel void computeN2Energy(
 #endif
        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* exclusionTiles,
+        __global const ushort2* exclusionTiles, int needEnergy,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -82,7 +82,8 @@ __kernel void computeN2Energy(
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
-                    energy += 0.5f*tempEnergy;
+                    if (needEnergy)
+                        energy += 0.5f*tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
 #ifdef USE_CUTOFF
@@ -133,7 +134,8 @@ __kernel void computeN2Energy(
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
-                    energy += tempEnergy;
+                    if (needEnergy)
+                        energy += tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
                    atom2 = tbx+tj;
@@ -250,7 +252,7 @@ __kernel void computeN2Energy(
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = get_local_id(0);
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
@@ -289,7 +291,8 @@ __kernel void computeN2Energy(
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
-                        energy += tempEnergy;
+                        if (needEnergy)
+                            energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        atom2 = tbx+tj;
@@ -328,7 +331,8 @@ __kernel void computeN2Energy(
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
-                        energy += tempEnergy;
+                        if (needEnergy)
+                            energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        atom2 = tbx+tj;

--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -18,7 +18,7 @@ __kernel void computeN2Energy(
 #endif
        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* exclusionTiles,
+        __global const ushort2* exclusionTiles, int needEnergy,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -255,7 +255,7 @@ __kernel void computeN2Energy(

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
 #ifdef USE_CUTOFF
-                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
 #else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
 #endif

--- a/platforms/opencl/src/kernels/customGBValueN2.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2.cl
@@ -229,7 +229,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = get_local_id(0);
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif

--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
@@ -228,7 +228,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
 #ifdef USE_CUTOFF
-                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
 #else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
 #endif

--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
 __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
        __global mixed* restrict sum, __global const float4* restrict gaussianValues, unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues,
-        const real energy, __global mixed* restrict energyParamDerivs
+        const mixed energy, __global mixed* restrict energyParamDerivs
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = get_global_id(0);

--- a/platforms/opencl/src/kernels/customNonbonded.cl
+++ b/platforms/opencl/src/kernels/customNonbonded.cl
@@ -14,8 +14,10 @@ if (!isExcluded) {
 #endif
    COMPUTE_FORCE
 #if USE_SWITCH
-    tempForce = tempForce*switchValue - tempEnergy*switchDeriv;
-    tempEnergy *= switchValue;
+    tempForce = tempForce*switchValue - customEnergy*switchDeriv;
+    tempEnergy += customEnergy*switchValue;
+#else
+    tempEnergy += customEnergy;
 #endif
    dEdR += tempForce*invR;
 }
--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
@@ -27,8 +27,19 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
            maxPos = max(maxPos, pos);
        }
        real4 blockSize = 0.5f*(maxPos-minPos);
+        real4 center = 0.5f*(maxPos+minPos);
+        center.w = 0;
+        for (int i = base; i < last; i++) {
+            pos = posq[i];
+            real4 delta = posq[i]-center;
+#ifdef USE_PERIODIC
+            APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+            center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
+        }
+        center.w = sqrt(center.w);
        blockBoundingBox[index] = blockSize;
-        blockCenter[index] = 0.5f*(maxPos+minPos);
+        blockCenter[index] = center;
        sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index);
        index += get_global_size(0);
        base = index*TILE_SIZE;
@@ -88,13 +99,13 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
    __local real3 posBuffer[GROUP_SIZE];
    __local volatile int workgroupTileIndex[GROUP_SIZE/32];
    __local bool includeBlockFlags[GROUP_SIZE];
-    __local short2 atomCountBuffer[GROUP_SIZE];
+    __local volatile short2 atomCountBuffer[GROUP_SIZE];
    __local int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
    __local int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
    __local volatile int* tileStartIndex = workgroupTileIndex+(warpStart/32);

    // Loop over blocks.
-    
+
    for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
        // Load data for this block.  Note that all threads in a warp are processing the same block.
        
@@ -142,10 +153,18 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(blockDelta)
 #endif
+                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
+#ifdef TRICLINIC
+                // The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
+                // If there's any possibility we might have missed it, do a detailed check.
+
+                if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
+                    includeBlock2 = true;
+#endif
                if (includeBlock2) {
                    unsigned short y = (unsigned short) sortedBlocks[block2].y;
                    for (int k = 0; k < numExclusions; k++)
@@ -165,8 +184,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi

                    // Check each atom in block Y for interactions.

-                    int start = y*TILE_SIZE;
-                    int atom2 = start+indexInWarp;
+                    int atom2 = y*TILE_SIZE+indexInWarp;
                    real3 pos2 = posq[atom2].xyz;
 #ifdef USE_PERIODIC
                    if (singlePeriodicCopy)

--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
@@ -232,7 +232,7 @@ __kernel void computeBornSum(
            real4 posq1 = posq[atom1];
            float2 params1 = global_params[atom1];
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
@@ -385,7 +385,7 @@ __kernel void computeGBSAForce1(
 #else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
+        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -452,7 +452,8 @@ __kernel void computeGBSAForce1(
                        if (atom1 != y*TILE_SIZE+j)
                            tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
-                        energy += 0.5f*tempEnergy;
+                        if (needEnergy)
+                            energy += 0.5f*tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
 #ifdef USE_CUTOFF
@@ -506,7 +507,8 @@ __kernel void computeGBSAForce1(
 #ifdef USE_CUTOFF
                        tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
-                        energy += tempEnergy;
+                        if (needEnergy)
+                            energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        localData[tbx+tj].fx += delta.x;
@@ -617,7 +619,7 @@ __kernel void computeGBSAForce1(
            real4 posq1 = posq[atom1];
            real bornRadius1 = global_bornRadii[atom1];
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
@@ -669,7 +671,8 @@ __kernel void computeGBSAForce1(
 #ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
-                            energy += tempEnergy;
+                            if (needEnergy)
+                                energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            localData[tbx+tj].fx += delta.x;
@@ -717,7 +720,8 @@ __kernel void computeGBSAForce1(
 #ifdef USE_CUTOFF
                            tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
-                            energy += tempEnergy;
+                            if (needEnergy)
+                                energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            localData[tbx+tj].fx += delta.x;

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -228,7 +228,7 @@ __kernel void computeBornSum(

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
 #ifdef USE_CUTOFF
-                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
 #else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
 #endif
@@ -407,7 +407,7 @@ __kernel void computeGBSAForce1(
 #else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
+        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
@@ -641,7 +641,7 @@ __kernel void computeGBSAForce1(

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
 #ifdef USE_CUTOFF
-                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
 #else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
 #endif

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
@@ -72,7 +72,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
 * Select the step size to use for the next step.
 */

-__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, __global mixed2* restrict dt,
+__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, __global mixed2* restrict dt,
        __global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
    // Calculate the error.

@@ -110,9 +110,9 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta

        // Recalculate the integration parameters.

-        mixed vscale = exp(-newStepSize/tau);
-        mixed fscale = (1-vscale)*tau;
-        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed vscale = exp(-newStepSize*friction);
+        mixed fscale = (friction == 0 ? newStepSize : (1-vscale)/friction);
+        mixed noisescale = sqrt(kT*(1-vscale*vscale));
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -269,7 +269,7 @@ __kernel void computeNonbonded(
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = get_local_id(0);
 #ifdef USE_CUTOFF
-            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
+            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -269,7 +269,7 @@ __kernel void computeNonbonded(

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
 #ifdef USE_CUTOFF
-                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
+                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
 #else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
 #endif

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -8,7 +8,7 @@ KEY_TYPE getValue(DATA_TYPE value) {
 * Sort a list that is short enough to entirely fit in local memory.  This is executed as
 * a single thread block.
 */
-__kernel void sortShortList(__global DATA_TYPE* __restrict__ data, uint length, __local DATA_TYPE* dataBuffer) {
+__kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __local DATA_TYPE* dataBuffer) {
    // Load the data into local memory.
    
    for (int index = get_local_id(0); index < length; index += get_local_size(0))
@@ -49,8 +49,8 @@ __kernel void sortShortList(__global DATA_TYPE* __restrict__ data, uint length,
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.
 */
-__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict buffer,
-        uint numBuckets, __global uint* restrict bucketOffset) {
+__kernel void computeRange(__global const DATA_TYPE* restrict data, uint length, __global KEY_TYPE* restrict range, __local KEY_TYPE* restrict minBuffer,
+        __local KEY_TYPE* restrict maxBuffer, uint numBuckets, __global uint* restrict bucketOffset) {
    KEY_TYPE minimum = MAX_KEY;
    KEY_TYPE maximum = MIN_KEY;

@@ -64,23 +64,18 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,

    // Now reduce them.

-    buffer[get_local_id(0)] = minimum;
+    minBuffer[get_local_id(0)] = minimum;
+    maxBuffer[get_local_id(0)] = maximum;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (uint step = 1; step < get_local_size(0); step *= 2) {
-        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
-            buffer[get_local_id(0)] = min(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    minimum = buffer[0];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    buffer[get_local_id(0)] = maximum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (uint step = 1; step < get_local_size(0); step *= 2) {
-        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0)
-            buffer[get_local_id(0)] = max(buffer[get_local_id(0)], buffer[get_local_id(0)+step]);
+        if (get_local_id(0)+step < get_local_size(0) && get_local_id(0)%(2*step) == 0) {
+            minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
+            maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
+        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-    maximum = buffer[0];
+    minimum = minBuffer[0];
+    maximum = maxBuffer[0];
    if (get_local_id(0) == 0) {
        range[0] = minimum;
        range[1] = maximum;
@@ -96,7 +91,7 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,
 * Assign elements to buckets.
 */
 __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, uint length, uint numBuckets, __global const KEY_TYPE* restrict range,
-        __global uint* bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
+        __global uint* restrict bucketOffset, __global uint* restrict bucketOfElement, __global uint* restrict offsetInBucket) {
 #ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.

--- a/platforms/reference/include/ReferenceCustomDynamics.h
+++ b/platforms/reference/include/ReferenceCustomDynamics.h
@@ -51,13 +51,14 @@ private:
    std::vector<CustomIntegratorUtilities::Comparison> comparisons;
    std::vector<bool> invalidatesForces, needsForces, needsEnergy, computeBothForceAndEnergy;
    std::vector<int> forceGroupFlags, blockEnd;
-    RealOpenMM energy;
    std::map<std::string, double> energyParamDerivs;
    Lepton::CompiledExpression kineticEnergyExpression;
    bool kineticEnergyNeedsForce;
    CompiledExpressionSet expressionSet;
-    int xIndex, vIndex, mIndex, fIndex, energyIndex, gaussianIndex, uniformIndex;
-    std::vector<int> forceVariableIndex, energyVariableIndex, perDofVariableIndex, stepVariableIndex;
+    double x, v, m, f, energy, gaussian, uniform;
+    int xIndex, vIndex;
+    std::vector<int> perDofVariableIndex, stepVariableIndex;
+    std::vector<double> perDofVariable;

    void initialize(OpenMM::ContextImpl& context, std::vector<RealOpenMM>& masses, std::map<std::string, RealOpenMM>& globals);
    
@@ -65,7 +66,7 @@ private:
    
    void computePerDof(int numberOfAtoms, std::vector<OpenMM::RealVec>& results, const std::vector<OpenMM::RealVec>& atomCoordinates,
                  const std::vector<OpenMM::RealVec>& velocities, const std::vector<OpenMM::RealVec>& forces, const std::vector<RealOpenMM>& masses,
-                  const std::vector<std::vector<OpenMM::RealVec> >& perDof, const Lepton::CompiledExpression& expression, int forceIndex);
+                  const std::vector<std::vector<OpenMM::RealVec> >& perDof, const Lepton::CompiledExpression& expression);
    
    void recordChangedParameters(OpenMM::ContextImpl& context, std::map<std::string, RealOpenMM>& globals);


--- a/platforms/reference/include/ReferenceKernels.h
+++ b/platforms/reference/include/ReferenceKernels.h
@@ -604,12 +604,21 @@ public:
     * @param nz      the number of grid points along the Z axis
     */
    void getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
+    /**
+     * Get the dispersion parameters being used for the dispersion term in LJPME.
+     *
+     * @param alpha   the separation parameter
+     * @param nx      the number of grid points along the X axis
+     * @param ny      the number of grid points along the Y axis
+     * @param nz      the number of grid points along the Z axis
+     */
+    void getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const;
 private:
    int numParticles, num14;
    int **bonded14IndexArray;
    RealOpenMM **particleParamArray, **bonded14ParamArray;
-    RealOpenMM nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, dispersionCoefficient;
-    int kmax[3], gridSize[3];
+    RealOpenMM nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, dispersionCoefficient;
+    int kmax[3], gridSize[3], dispersionGridSize[3];
    bool useSwitchingFunction;
    std::vector<std::set<int> > exclusions;
    NonbondedMethod nonbondedMethod;

--- a/platforms/reference/include/ReferenceLJCoulombIxn.h
+++ b/platforms/reference/include/ReferenceLJCoulombIxn.h
@@ -38,14 +38,14 @@ class ReferenceLJCoulombIxn {
      bool useSwitch;
      bool periodic;
      bool ewald;
-      bool pme;
+      bool pme, ljpme;
      const OpenMM::NeighborList* neighborList;
      OpenMM::RealVec periodicBoxVectors[3];
      RealOpenMM cutoffDistance, switchingDistance;
      RealOpenMM krf, crf;
-      RealOpenMM alphaEwald;
+      RealOpenMM alphaEwald, alphaDispersionEwald;
      int numRx, numRy, numRz;
-      int meshDim[3];
+      int meshDim[3], dispersionMeshDim[3];

      // parameter indices

@@ -139,16 +139,28 @@ class ReferenceLJCoulombIxn {

     
      /**---------------------------------------------------------------------------------------
-      
+
         Set the force to use Particle-Mesh Ewald (PME) summation.
-      
+
         @param alpha    the Ewald separation parameter
         @param gridSize the dimensions of the mesh
-      
+
         --------------------------------------------------------------------------------------- */
-      
+
      void setUsePME(RealOpenMM alpha, int meshSize[3]);
-      
+
+
+      /**---------------------------------------------------------------------------------------
+
+         Set the force to use Particle-Mesh Ewald (PME) summation for dispersion.
+
+         @param dalpha    the dispersion Ewald separation parameter
+         @param dmeshSize the dimensions of the dispersion mesh
+
+         --------------------------------------------------------------------------------------- */
+
+      void setUseLJPME(RealOpenMM dalpha, int dmeshSize[3]);
+
      /**---------------------------------------------------------------------------------------
      
         Calculate LJ Coulomb pair ixn

--- a/platforms/reference/include/ReferencePME.h
+++ b/platforms/reference/include/ReferencePME.h
@@ -87,6 +87,28 @@ pme_exec(pme_t       pme,
         RealOpenMM *    energy);


+/*
+ * Evaluate reciprocal space PME dispersion energy and forces.
+ *
+ * Args:
+ *
+ * pme                 Opaque pme_t object, must have been initialized with pme_init()
+ * atomCoordinates     Vector of atom coordinates (nm)
+ * forces              Vector of forces (will be written as kJ/mol/nm)
+ * c6s                 Array of c6 coefficients (units of sqrt(kJ/mol).nm^3 )
+ * periodicBoxVectors  Simulation cell dimensions (nm)
+ * energy              Total energy (will be written in units of kJ/mol)
+ */
+int OPENMM_EXPORT
+pme_exec_dpme(pme_t       pme,
+              const std::vector<OpenMM::RealVec>& atomCoordinates,
+              std::vector<OpenMM::RealVec>& forces,
+              const std::vector<RealOpenMM>& c6s,
+              const OpenMM::RealVec  periodicBoxVectors[3],
+              RealOpenMM *    energy);
+
+
+

 /* Release all memory in pme structure */
 int OPENMM_EXPORT
@@ -94,4 +116,4 @@ pme_destroy(pme_t    pme);

 } // namespace OpenMM

-#endif // __ReferencePME_H__
\ No newline at end of file
+#endif // __ReferencePME_H__
--- a/platforms/reference/include/ReferencePlatform.h
+++ b/platforms/reference/include/ReferencePlatform.h
@@ -56,7 +56,7 @@ public:
    void contextDestroyed(ContextImpl& context) const;
 };

-class ReferencePlatform::PlatformData {
+class OPENMM_EXPORT ReferencePlatform::PlatformData {
 public:
    PlatformData(const System& system);
    ~PlatformData();

--- a/platforms/reference/include/ReferenceStochasticDynamics.h
+++ b/platforms/reference/include/ReferenceStochasticDynamics.h

-/* Portions copyright (c) 2006-2012 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2016 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -36,7 +36,7 @@ class OPENMM_EXPORT ReferenceStochasticDynamics : public ReferenceDynamics {

      std::vector<OpenMM::RealVec> xPrime;
      std::vector<RealOpenMM> inverseMasses;
-      RealOpenMM _tau;
+      RealOpenMM friction;
      
   public:

@@ -46,12 +46,12 @@ class OPENMM_EXPORT ReferenceStochasticDynamics : public ReferenceDynamics {

         @param numberOfAtoms  number of atoms
         @param deltaT         delta t for dynamics
-         @param tau            viscosity
+         @param friction       friction coefficient
         @param temperature    temperature
      
         --------------------------------------------------------------------------------------- */

-       ReferenceStochasticDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM tau, RealOpenMM temperature);
+       ReferenceStochasticDynamics(int numberOfAtoms, RealOpenMM deltaT, RealOpenMM friction, RealOpenMM temperature);

      /**---------------------------------------------------------------------------------------
      
@@ -63,13 +63,11 @@ class OPENMM_EXPORT ReferenceStochasticDynamics : public ReferenceDynamics {

      /**---------------------------------------------------------------------------------------
      
-         Get tau
-      
-         @return tau
+         Get friction coefficient
      
         --------------------------------------------------------------------------------------- */
      
-      RealOpenMM getTau() const;
+      RealOpenMM getFriction() const;
      
      /**---------------------------------------------------------------------------------------