Commit 69e75377 authored by Peter Eastman
Browse files

Added "const" and "restrict" to lots of kernel arguments to let the compiler do more optimizations

parent bf8b9f30
......@@ -8,11 +8,11 @@
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* bornSum,
__global const long* restrict bornSum,
#else
__global float* bornSum,
__global const float* restrict bornSum,
#endif
__global float2* params, __global float* bornRadii, __global float* obcChain) {
__global const float2* restrict params, __global float* restrict bornRadii, __global float* restrict obcChain) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Get summed Born data
......@@ -49,9 +49,9 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* bornForceIn,
__global const long* restrict bornForceIn,
#endif
__global float* energyBuffer, __global float2* params, __global float* bornRadii, __global float* obcChain) {
__global float* restrict energyBuffer, __global const float2* restrict params, __global const float* restrict bornRadii, __global const float* restrict obcChain) {
float energy = 0.0f;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
......
......@@ -14,10 +14,10 @@ typedef struct {
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer,
__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData* restrict localData, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
unsigned int numTiles) {
#endif
......@@ -190,11 +190,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
__global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
__local AtomData* restrict localData, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
unsigned int numTiles) {
#endif
......
......@@ -12,10 +12,10 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData1* localData, __local float* tempBuffer,
void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
unsigned int numTiles) {
#endif
......@@ -205,11 +205,11 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* global_bornForce,
__global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
__local AtomData2* localData, __local float4* tempBuffer,
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
unsigned int numTiles) {
#endif
......
......@@ -16,14 +16,14 @@ typedef struct {
*/
__kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* global_bornSum,
__global long* restrict global_bornSum,
#else
__global float* global_bornSum,
__global float* restrict global_bornSum,
#endif
__global float4* posq, __global float2* global_params,
__local AtomData1* localData, __local float* tempBuffer,
__global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* restrict localData, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags,
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
unsigned int numTiles,
#endif
......@@ -337,14 +337,14 @@ typedef struct {
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers, __global long* global_bornForce,
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global float4* forceBuffers, __global float* global_bornForce,
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
__global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
__local AtomData2* localData, __local float4* tempBuffer,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags,
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
unsigned int numTiles,
#endif
......
......@@ -8,8 +8,8 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration.
*/
__kernel void integrateLangevinPart1(__global float4* velm, __global float4* force, __global float4* posDelta,
__global float* paramBuffer, __global float2* dt, __global float4* random, unsigned int randomIndex) {
__kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta,
__global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
float vscale = paramBuffer[VelScale];
float fscale = paramBuffer[ForceScale];
float noisescale = paramBuffer[NoiseScale];
......@@ -31,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* velm, __global float4* for
* Perform the second step of Langevin integration.
*/
__kernel void integrateLangevinPart2(__global float4* posq, __global float4* posDelta, __global float4* velm, __global float2* dt) {
__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
#ifdef cl_khr_fp64
double invStepSize = 1.0/dt[0].y;
#else
......@@ -58,8 +58,8 @@ __kernel void integrateLangevinPart2(__global float4* posq, __global float4* pos
* Select the step size to use for the next step.
*/
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* dt,
__global float4* velm, __global float4* force, __global float* paramBuffer, __local float* params, __local float* error) {
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt,
__global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) {
// Calculate the error.
float err = 0.0f;
......
......@@ -2,8 +2,8 @@
* Scale the particle positions.
*/
__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq,
__global int* moleculeAtoms, __global int* moleculeStartIndex) {
__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* restrict posq,
__global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) {
for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) {
int first = moleculeStartIndex[index];
int last = moleculeStartIndex[index+1];
......
......@@ -11,11 +11,11 @@ typedef struct {
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
......@@ -12,11 +12,11 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
......@@ -16,15 +16,15 @@ typedef struct {
*/
__kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers,
__global long* restrict forceBuffers,
#else
__global float4* forceBuffers,
__global float4* restrict forceBuffers,
#endif
__global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float* restrict tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache,
__global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
......@@ -38,7 +39,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
......@@ -75,7 +76,8 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global long* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER;
......@@ -122,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
}
}
__kernel void finishSpreadCharge(__global long* pmeGrid) {
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
__global float2* floatGrid = (__global float2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
......@@ -133,7 +135,8 @@ __kernel void finishSpreadCharge(__global long* pmeGrid) {
}
}
#else
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point.
......@@ -190,8 +193,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
}
#endif
__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
__global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
__global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
......@@ -220,7 +223,8 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
energyBuffer[get_global_id(0)] += 0.5f*energy;
}
__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* bsplinesCache) {
__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid,
float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* restrict bsplinesCache) {
const float4 scale = 1.0f/(PME_ORDER-1);
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
__local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
......
__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __global float4* restrict pmeBsplineDTheta, __local float4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
......@@ -42,10 +42,10 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
/**
* This kernel is not actually used when running on a CPU.
*/
__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
}
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++)
......@@ -82,8 +82,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
}
}
__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
__global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
__global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
......@@ -112,7 +112,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
energyBuffer[get_global_id(0)] += 0.5f*energy;
}
__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float4* restrict pmeBsplineTheta, __global const float4* restrict pmeBsplineDTheta, __global const float2* restrict pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
float4 force = 0.0f;
float4 pos = posq[atom];
......
......@@ -2,7 +2,7 @@
* Generate random numbers
*/
__kernel void generateRandomNumbers(int numValues, __global float4* random, __global uint4* seed) {
__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
int index = get_global_id(0);
uint4 state = seed[index];
unsigned int carry = 0;
......
......@@ -2,7 +2,7 @@
* Calculate the center of mass momentum.
*/
__kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __global float4* cmMomentum, __local float4* temp) {
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local float4* restrict temp) {
int index = get_global_id(0);
float4 cm = 0.0f;
while (index < numAtoms) {
......@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __gl
* Remove center of mass motion.
*/
__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* velm, __global float4* cmMomentum, __local float4* temp) {
__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups.
int index = get_local_id(0);
......
......@@ -2,7 +2,7 @@
* Enforce constraints on SETTLE clusters
*/
__kernel void applySettle(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global float4* velm, __global int4* clusterAtoms, __global float2* clusterParams) {
__kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global const float4* restrict posDelta, __global float4* restrict newDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0);
while (index < numClusters) {
// Load the data for this cluster.
......
......@@ -2,7 +2,7 @@
* Enforce constraints on SHAKE clusters
*/
__kernel void applyShakeToHydrogens(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global int4* clusterAtoms, __global float4* clusterParams) {
__kernel void applyShakeToHydrogens(int numClusters, float tol, __global const float4* restrict oldPos, __global const float4* restrict posDelta, __global float4* restrict newDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
int index = get_global_id(0);
while (index < numClusters) {
// Load the data for this cluster.
......
......@@ -8,7 +8,7 @@ float getValue(TYPE value) {
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__kernel void computeRange(__global TYPE* data, int length, __global float2* range, __local float* buffer) {
__kernel void computeRange(__global const TYPE* restrict data, int length, __global float2* restrict range, __local float* restrict buffer) {
float minimum = MAXFLOAT;
float maximum = -MAXFLOAT;
......@@ -45,8 +45,8 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
/**
* Assign elements to buckets.
*/
__kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range,
__global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
__kernel void assignElementsToBuckets(__global const TYPE* restrict data, int length, int numBuckets, __global const float2* restrict range,
__global int* bucketOffset, __global int* restrict bucketOfElement, __global int* restrict offsetInBucket) {
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
......@@ -72,7 +72,7 @@ __kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBu
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset, __local int* buffer) {
__kernel void computeBucketPositions(int numBuckets, __global int* restrict bucketOffset, __local int* restrict buffer) {
int globalOffset = 0;
for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
// Load the bucket sizes into local memory.
......@@ -101,7 +101,7 @@ __kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset,
/**
* Copy the input data into the buckets for sorting.
*/
__kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int length, __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
__kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYPE* restrict buckets, int length, __global const int* restrict bucketOffset, __global const int* restrict bucketOfElement, __global const int* restrict offsetInBucket) {
for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
TYPE element = data[index];
int bucketIndex = bucketOfElement[index];
......@@ -113,7 +113,7 @@ __kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int
/**
* Sort the data in each bucket.
*/
__kernel void sortBuckets(__global TYPE* data, __global TYPE* buckets, int numBuckets, __global int* bucketOffset, __local TYPE* buffer) {
__kernel void sortBuckets(__global TYPE* restrict data, __global const TYPE* restrict buckets, int numBuckets, __global const int* restrict bucketOffset, __local TYPE* restrict buffer) {
for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
int endIndex = bucketOffset[index];
......
......@@ -2,7 +2,7 @@
* Fill a buffer with 0.
*/
__kernel void clearBuffer(__global int* buffer, int size) {
__kernel void clearBuffer(__global int* restrict buffer, int size) {
int index = get_global_id(0);
__global int4* buffer4 = (__global int4*) buffer;
int sizeDiv4 = size/4;
......@@ -18,7 +18,7 @@ __kernel void clearBuffer(__global int* buffer, int size) {
/**
* Fill two buffers with 0.
*/
__kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2) {
__kernel void clearTwoBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
}
......@@ -26,7 +26,7 @@ __kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* bu
/**
* Fill three buffers with 0.
*/
__kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3) {
__kernel void clearThreeBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
......@@ -35,7 +35,7 @@ __kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int*
/**
* Fill four buffers with 0.
*/
__kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4) {
__kernel void clearFourBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
......@@ -45,7 +45,7 @@ __kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* b
/**
* Fill five buffers with 0.
*/
__kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5) {
__kernel void clearFiveBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4, __global int* restrict buffer5, int size5) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
......@@ -56,7 +56,7 @@ __kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* b
/**
* Fill six buffers with 0.
*/
__kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5, __global int* buffer6, int size6) {
__kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4, __global int* restrict buffer5, int size5, __global int* restrict buffer6, int size6) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
......@@ -69,7 +69,7 @@ __kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* bu
* Sum a collection of buffers into the first one.
*/
__kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int numBuffers) {
__kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSize, int numBuffers) {
int index = get_global_id(0);
int totalSize = bufferSize*numBuffers;
while (index < bufferSize) {
......@@ -84,7 +84,7 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
/**
* Sum the various buffers containing forces.
*/
__kernel void reduceForces(__global long* longBuffer, __global float4* buffer, int bufferSize, int numBuffers) {
__kernel void reduceForces(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers;
float scale = 1.0f/(float) 0xFFFFFFFF;
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
......@@ -99,7 +99,7 @@ __kernel void reduceForces(__global long* longBuffer, __global float4* buffer, i
* This is called to determine the accuracy of various native functions.
*/
__kernel void determineNativeAccuracy(__global float8* values, int numValues) {
__kernel void determineNativeAccuracy(__global float8* restrict values, int numValues) {
for (int i = get_global_id(0); i < numValues; i += get_global_size(0)) {
float v = values[i].s0;
values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
......
......@@ -6,7 +6,7 @@
* Perform the first step of verlet integration.
*/
__kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global float4* posq, __global float4* velm, __global float4* force, __global float4* posDelta) {
__kernel void integrateVerletPart1(int numAtoms, __global const float2* restrict dt, __global const float4* restrict posq, __global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta) {
float2 stepSize = dt[0];
float dtPos = stepSize.y;
float dtVel = 0.5f*(stepSize.x+stepSize.y);
......@@ -26,7 +26,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global f
* Perform the second step of verlet integration.
*/
__kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global float4* posq, __global float4* velm, __global float4* posDelta) {
__kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, __global float4* restrict posq, __global float4* restrict velm, __global const float4* restrict posDelta) {
float2 stepSize = dt[0];
#ifdef cl_khr_fp64
double oneOverDt = 1.0/stepSize.y;
......@@ -57,7 +57,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global f
* Select the step size to use for the next step.
*/
__kernel void selectVerletStepSize(int numAtoms, float maxStepSize, float errorTol, __global float2* dt, __global float4* velm, __global float4* force, __local float* error) {
__kernel void selectVerletStepSize(int numAtoms, float maxStepSize, float errorTol, __global float2* restrict dt, __global const float4* restrict velm, __global const float4* restrict force, __local float* restrict error) {
// Calculate the error.
float err = 0.0f;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment