Added "const" and "restrict" to lots of kernel arguments to let the compiler do more optimizations

69e75377 · Peter Eastman · bf8b9f30 · 69e75377 · 69e75377 · 69e75377
Commit 69e75377 authored Oct 07, 2011 by Peter Eastman
18 changed files
--- a/platforms/opencl/src/kernels/gbsaObcReductions.cl
+++ b/platforms/opencl/src/kernels/gbsaObcReductions.cl
@@ -8,11 +8,11 @@

 __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            __global long* bornSum,
+            __global const long* restrict bornSum,
 #else
-            __global float* bornSum,
+            __global const float* restrict bornSum,
 #endif
-            __global float2* params, __global float* bornRadii, __global float* obcChain) {
+            __global const float2* restrict params, __global float* restrict bornRadii, __global float* restrict obcChain) {
    unsigned int index = get_global_id(0);
    while (index < NUM_ATOMS) {
        // Get summed Born data
@@ -49,9 +49,9 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b

 __kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce,
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            __global long* bornForceIn,
+            __global const long* restrict bornForceIn,
 #endif
-            __global float* energyBuffer, __global float2* params, __global float* bornRadii, __global float* obcChain) {
+            __global float* restrict energyBuffer, __global const float2* restrict params, __global const float* restrict bornRadii, __global const float* restrict obcChain) {
    float energy = 0.0f;
    unsigned int index = get_global_id(0);
    while (index < NUM_ATOMS) {

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -14,10 +14,10 @@ typedef struct {
 * Compute the Born sum.
 */

-__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData* localData, __local float* tempBuffer,
+__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
+        __local AtomData* restrict localData, __local float* restrict tempBuffer,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else
        unsigned int numTiles) {
 #endif
@@ -190,11 +190,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
 * First part of computing the GBSA interaction.
 */

-__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
-        __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
-        __local AtomData* localData, __local float4* tempBuffer,
+__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
+        __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
+        __local AtomData* restrict localData, __local float4* restrict tempBuffer,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
 #else
        unsigned int numTiles) {
 #endif

--- a/platforms/opencl/src/kernels/gbsaObc_default.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_default.cl
@@ -12,10 +12,10 @@ typedef struct {
 */

 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
-        __local AtomData1* localData, __local float* tempBuffer,
+void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
+        __local AtomData1* restrict localData, __local float* restrict tempBuffer,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
        unsigned int numTiles) {
 #endif
@@ -205,11 +205,11 @@ typedef struct {
 */

 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeGBSAForce1(__global float4* forceBuffers, __global float* global_bornForce,
-        __global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
-        __local AtomData2* localData, __local float4* tempBuffer,
+void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
+        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
+        __local AtomData2* restrict localData, __local float4* restrict tempBuffer,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
 #else
        unsigned int numTiles) {
 #endif

--- a/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_nvidia.cl
@@ -16,14 +16,14 @@ typedef struct {
 */
 __kernel void computeBornSum(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* global_bornSum,
+        __global long* restrict global_bornSum,
 #else
-        __global float* global_bornSum,
+        __global float* restrict global_bornSum,
 #endif
-        __global float4* posq, __global float2* global_params,
-        __local AtomData1* localData, __local float* tempBuffer,
+        __global const float4* restrict posq, __global const float2* restrict global_params,
+        __local AtomData1* restrict localData, __local float* restrict tempBuffer,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags,
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
 #else
        unsigned int numTiles,
 #endif
@@ -337,14 +337,14 @@ typedef struct {

 __kernel void computeGBSAForce1(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* forceBuffers, __global long* global_bornForce,
+        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
 #else
-        __global float4* forceBuffers, __global float* global_bornForce,
+        __global float4* restrict forceBuffers, __global float* restrict global_bornForce,
 #endif
-        __global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
-        __local AtomData2* localData, __local float4* tempBuffer,
+        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
+        __local AtomData2* restrict localData, __local float4* restrict tempBuffer,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags,
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
 #else
        unsigned int numTiles,
 #endif

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
@@ -8,8 +8,8 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
 * Perform the first step of Langevin integration.
 */

-__kernel void integrateLangevinPart1(__global float4* velm, __global float4* force, __global float4* posDelta,
-        __global float* paramBuffer, __global float2* dt, __global float4* random, unsigned int randomIndex) {
+__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta,
+        __global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
    float vscale = paramBuffer[VelScale];
    float fscale = paramBuffer[ForceScale];
    float noisescale = paramBuffer[NoiseScale];
@@ -31,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* velm, __global float4* for
 * Perform the second step of Langevin integration.
 */

-__kernel void integrateLangevinPart2(__global float4* posq, __global float4* posDelta, __global float4* velm, __global float2* dt) {
+__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
 #ifdef cl_khr_fp64
    double invStepSize = 1.0/dt[0].y;
 #else
@@ -58,8 +58,8 @@ __kernel void integrateLangevinPart2(__global float4* posq, __global float4* pos
 * Select the step size to use for the next step.
 */

-__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* dt,
-        __global float4* velm, __global float4* force, __global float* paramBuffer, __local float* params, __local float* error) {
+__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt,
+        __global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) {
    // Calculate the error.

    float err = 0.0f;

--- a/platforms/opencl/src/kernels/monteCarloBarostat.cl
+++ b/platforms/opencl/src/kernels/monteCarloBarostat.cl
@@ -2,8 +2,8 @@
 * Scale the particle positions.
 */

-__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq,
-        __global int* moleculeAtoms, __global int* moleculeStartIndex) {
+__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* restrict posq,
+        __global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) {
    for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) {
        int first = moleculeStartIndex[index];
        int last = moleculeStartIndex[index+1];

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -11,11 +11,11 @@ typedef struct {
 * Compute nonbonded interactions.
 */

-__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
-        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
+__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
        unsigned int numTiles
 #endif

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
@@ -12,11 +12,11 @@ typedef struct {
 */

 __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
-void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
-        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
+void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
        unsigned int numTiles
 #endif

--- a/platforms/opencl/src/kernels/nonbonded_nvidia.cl
+++ b/platforms/opencl/src/kernels/nonbonded_nvidia.cl
@@ -16,15 +16,15 @@ typedef struct {
 */
 __kernel void computeNonbonded(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* forceBuffers,
+        __global long* restrict forceBuffers,
 #else
-        __global float4* forceBuffers,
+        __global float4* restrict forceBuffers,
 #endif
-        __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
-        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
+        __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float* restrict tempBuffer,
        unsigned int startTileIndex, unsigned int endTileIndex,
 #ifdef USE_CUTOFF
-        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
+        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
 #else
        unsigned int numTiles
 #endif

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
-__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache,
+        __global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
@@ -38,7 +39,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
 /**
 * For each grid point, find the range of sorted atoms associated with that point.
 */
-__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
    int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
@@ -75,7 +76,8 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
 #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)

 __kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
-__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global long* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
+        __global long* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
    int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
    int iy = remainder/PME_ORDER;
@@ -122,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
    }
 }

-__kernel void finishSpreadCharge(__global long* pmeGrid) {
+__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
    __global float2* floatGrid = (__global float2*) pmeGrid;
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
@@ -133,7 +135,8 @@ __kernel void finishSpreadCharge(__global long* pmeGrid) {
    }
 }
 #else
-__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
+__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
+        __global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -190,8 +193,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
 }
 #endif

-__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
-        __global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
+__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
+        __global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    float energy = 0.0f;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
@@ -220,7 +223,8 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* bsplinesCache) {
+__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid,
+        float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* restrict bsplinesCache) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
    __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];

--- a/platforms/opencl/src/kernels/pme_cpu.cl
+++ b/platforms/opencl/src/kernels/pme_cpu.cl
-__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __global float4* restrict pmeBsplineDTheta, __local float4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    const float4 scale = 1.0f/(PME_ORDER-1);
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
@@ -42,10 +42,10 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
 /**
 * This kernel is not actually used when running on a CPU.
 */
-__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
 }

-__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++)
@@ -82,8 +82,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
    }
 }

-__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
-        __global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
+__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
+        __global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    float energy = 0.0f;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
@@ -112,7 +112,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float4* restrict pmeBsplineTheta, __global const float4* restrict pmeBsplineDTheta, __global const float2* restrict pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        float4 force = 0.0f;
        float4 pos = posq[atom];

--- a/platforms/opencl/src/kernels/random.cl
+++ b/platforms/opencl/src/kernels/random.cl
@@ -2,7 +2,7 @@
 * Generate random numbers
 */

-__kernel void generateRandomNumbers(int numValues, __global float4* random, __global uint4* seed) {
+__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
    int index = get_global_id(0);
    uint4 state = seed[index];
    unsigned int carry = 0;

--- a/platforms/opencl/src/kernels/removeCM.cl
+++ b/platforms/opencl/src/kernels/removeCM.cl
@@ -2,7 +2,7 @@
 * Calculate the center of mass momentum.
 */

-__kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __global float4* cmMomentum, __local float4* temp) {
+__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local float4* restrict temp) {
    int index = get_global_id(0);
    float4 cm = 0.0f;
    while (index < numAtoms) {
@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __gl
 * Remove center of mass motion.
 */

-__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* velm, __global float4* cmMomentum, __local float4* temp) {
+__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local float4* restrict temp) {
    // First sum all of the momenta that were calculated by individual groups.

    int index = get_local_id(0);

--- a/platforms/opencl/src/kernels/settle.cl
+++ b/platforms/opencl/src/kernels/settle.cl
@@ -2,7 +2,7 @@
 * Enforce constraints on SETTLE clusters
 */

-__kernel void applySettle(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global float4* velm, __global int4* clusterAtoms, __global float2* clusterParams) {
+__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global const float4* restrict posDelta, __global float4* restrict newDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
    int index = get_global_id(0);
    while (index < numClusters) {
        // Load the data for this cluster.

--- a/platforms/opencl/src/kernels/shakeHydrogens.cl
+++ b/platforms/opencl/src/kernels/shakeHydrogens.cl
@@ -2,7 +2,7 @@
 * Enforce constraints on SHAKE clusters
 */

-__kernel void applyShakeToHydrogens(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global int4* clusterAtoms, __global float4* clusterParams) {
+__kernel void applyShakeToHydrogens(int numClusters, float tol, __global const float4* restrict oldPos, __global const float4* restrict posDelta, __global float4* restrict newDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
    int index = get_global_id(0);
    while (index < numClusters) {
        // Load the data for this cluster.

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -8,7 +8,7 @@ float getValue(TYPE value) {
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.
 */
-__kernel void computeRange(__global TYPE* data, int length, __global float2* range, __local float* buffer) {
+__kernel void computeRange(__global const TYPE* restrict data, int length, __global float2* restrict range, __local float* restrict buffer) {
    float minimum = MAXFLOAT;
    float maximum = -MAXFLOAT;

@@ -45,8 +45,8 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
 /**
 * Assign elements to buckets.
 */
-__kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range,
-        __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
+__kernel void assignElementsToBuckets(__global const TYPE* restrict data, int length, int numBuckets, __global const float2* restrict range,
+        __global int* bucketOffset, __global int* restrict bucketOfElement, __global int* restrict offsetInBucket) {
 #ifdef AMD_ATOMIC_WORK_AROUND
    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
    // This prevents the atomic access from forcing all word accesses to other buffers onto the slow complete path.
@@ -72,7 +72,7 @@ __kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBu
 * Sum the bucket sizes to compute the start position of each bucket.  This kernel
 * is executed as a single work group.
 */
-__kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset, __local int* buffer) {
+__kernel void computeBucketPositions(int numBuckets, __global int* restrict bucketOffset, __local int* restrict buffer) {
    int globalOffset = 0;
    for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
        // Load the bucket sizes into local memory.
@@ -101,7 +101,7 @@ __kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset,
 /**
 * Copy the input data into the buckets for sorting.
 */
-__kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int length, __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
+__kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYPE* restrict buckets, int length, __global const int* restrict bucketOffset, __global const int* restrict bucketOfElement, __global const int* restrict offsetInBucket) {
    for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
        TYPE element = data[index];
        int bucketIndex = bucketOfElement[index];
@@ -113,7 +113,7 @@ __kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int
 /**
 * Sort the data in each bucket.
 */
-__kernel void sortBuckets(__global TYPE* data, __global TYPE* buckets, int numBuckets, __global int* bucketOffset, __local TYPE* buffer) {
+__kernel void sortBuckets(__global TYPE* restrict data, __global const TYPE* restrict buckets, int numBuckets, __global const int* restrict bucketOffset, __local TYPE* restrict buffer) {
    for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
        int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
        int endIndex = bucketOffset[index];

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -2,7 +2,7 @@
 * Fill a buffer with 0.
 */

-__kernel void clearBuffer(__global int* buffer, int size) {
+__kernel void clearBuffer(__global int* restrict buffer, int size) {
    int index = get_global_id(0);
    __global int4* buffer4 = (__global int4*) buffer;
    int sizeDiv4 = size/4;
@@ -18,7 +18,7 @@ __kernel void clearBuffer(__global int* buffer, int size) {
 /**
 * Fill two buffers with 0.
 */
-__kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2) {
+__kernel void clearTwoBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
 }
@@ -26,7 +26,7 @@ __kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* bu
 /**
 * Fill three buffers with 0.
 */
-__kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3) {
+__kernel void clearThreeBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
    clearBuffer(buffer3, size3);
@@ -35,7 +35,7 @@ __kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int*
 /**
 * Fill four buffers with 0.
 */
-__kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4) {
+__kernel void clearFourBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
    clearBuffer(buffer3, size3);
@@ -45,7 +45,7 @@ __kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* b
 /**
 * Fill five buffers with 0.
 */
-__kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5) {
+__kernel void clearFiveBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4, __global int* restrict buffer5, int size5) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
    clearBuffer(buffer3, size3);
@@ -56,7 +56,7 @@ __kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* b
 /**
 * Fill six buffers with 0.
 */
-__kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5, __global int* buffer6, int size6) {
+__kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4, __global int* restrict buffer5, int size5, __global int* restrict buffer6, int size6) {
    clearBuffer(buffer1, size1);
    clearBuffer(buffer2, size2);
    clearBuffer(buffer3, size3);
@@ -69,7 +69,7 @@ __kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* bu
 * Sum a collection of buffers into the first one.
 */

-__kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int numBuffers) {
+__kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSize, int numBuffers) {
    int index = get_global_id(0);
    int totalSize = bufferSize*numBuffers;
    while (index < bufferSize) {
@@ -84,7 +84,7 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
 /**
 * Sum the various buffers containing forces.
 */
-__kernel void reduceForces(__global long* longBuffer, __global float4* buffer, int bufferSize, int numBuffers) {
+__kernel void reduceForces(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) {
    int totalSize = bufferSize*numBuffers;
    float scale = 1.0f/(float) 0xFFFFFFFF;
    for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
@@ -99,7 +99,7 @@ __kernel void reduceForces(__global long* longBuffer, __global float4* buffer, i
 * This is called to determine the accuracy of various native functions.
 */

-__kernel void determineNativeAccuracy(__global float8* values, int numValues) {
+__kernel void determineNativeAccuracy(__global float8* restrict values, int numValues) {
    for (int i = get_global_id(0); i < numValues; i += get_global_size(0)) {
        float v = values[i].s0;
        values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);

--- a/platforms/opencl/src/kernels/verlet.cl
+++ b/platforms/opencl/src/kernels/verlet.cl
@@ -6,7 +6,7 @@
 * Perform the first step of verlet integration.
 */

-__kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global float4* posq, __global float4* velm, __global float4* force, __global float4* posDelta) {
+__kernel void integrateVerletPart1(int numAtoms, __global const float2* restrict dt, __global const float4* restrict posq, __global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta) {
    float2 stepSize = dt[0];
    float dtPos = stepSize.y;
    float dtVel = 0.5f*(stepSize.x+stepSize.y);
@@ -26,7 +26,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global f
 * Perform the second step of verlet integration.
 */

-__kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global float4* posq, __global float4* velm, __global float4* posDelta) {
+__kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, __global float4* restrict posq, __global float4* restrict velm, __global const float4* restrict posDelta) {
    float2 stepSize = dt[0];
 #ifdef cl_khr_fp64
    double oneOverDt = 1.0/stepSize.y;
@@ -57,7 +57,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global f
 * Select the step size to use for the next step.
 */

-__kernel void selectVerletStepSize(int numAtoms, float maxStepSize, float errorTol, __global float2* dt, __global float4* velm, __global float4* force, __local float* error) {
+__kernel void selectVerletStepSize(int numAtoms, float maxStepSize, float errorTol, __global float2* restrict dt, __global const float4* restrict velm, __global const float4* restrict force, __local float* restrict error) {
    // Calculate the error.

    float err = 0.0f;