Commit 69e75377 authored by Peter Eastman
Browse files

Added "const" and "restrict" to lots of kernel arguments to let the compiler do more optimizations

parent bf8b9f30
...@@ -8,11 +8,11 @@ ...@@ -8,11 +8,11 @@
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma, __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* bornSum, __global const long* restrict bornSum,
#else #else
__global float* bornSum, __global const float* restrict bornSum,
#endif #endif
__global float2* params, __global float* bornRadii, __global float* obcChain) { __global const float2* restrict params, __global float* restrict bornRadii, __global float* restrict obcChain) {
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
// Get summed Born data // Get summed Born data
...@@ -49,9 +49,9 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -49,9 +49,9 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* bornForceIn, __global const long* restrict bornForceIn,
#endif #endif
__global float* energyBuffer, __global float2* params, __global float* bornRadii, __global float* obcChain) { __global float* restrict energyBuffer, __global const float2* restrict params, __global const float* restrict bornRadii, __global const float* restrict obcChain) {
float energy = 0.0f; float energy = 0.0f;
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
......
...@@ -14,10 +14,10 @@ typedef struct { ...@@ -14,10 +14,10 @@ typedef struct {
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData* localData, __local float* tempBuffer, __local AtomData* restrict localData, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -190,11 +190,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po ...@@ -190,11 +190,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
......
...@@ -12,10 +12,10 @@ typedef struct { ...@@ -12,10 +12,10 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* localData, __local float* tempBuffer, __local AtomData1* restrict localData, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -205,11 +205,11 @@ typedef struct { ...@@ -205,11 +205,11 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* global_bornForce, void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* energyBuffer, __global float4* posq, __global float* global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* localData, __local float4* tempBuffer, __local AtomData2* restrict localData, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
......
...@@ -16,14 +16,14 @@ typedef struct { ...@@ -16,14 +16,14 @@ typedef struct {
*/ */
__kernel void computeBornSum( __kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* global_bornSum, __global long* restrict global_bornSum,
#else #else
__global float* global_bornSum, __global float* restrict global_bornSum,
#endif #endif
__global float4* posq, __global float2* global_params, __global const float4* restrict posq, __global const float2* restrict global_params,
__local AtomData1* localData, __local float* tempBuffer, __local AtomData1* restrict localData, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
...@@ -337,14 +337,14 @@ typedef struct { ...@@ -337,14 +337,14 @@ typedef struct {
__kernel void computeGBSAForce1( __kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers, __global long* global_bornForce, __global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else #else
__global float4* forceBuffers, __global float* global_bornForce, __global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif #endif
__global float* energyBuffer, __global float4* posq, __global float* global_bornRadii, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* localData, __local float4* tempBuffer, __local AtomData2* restrict localData, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
......
...@@ -8,8 +8,8 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams}; ...@@ -8,8 +8,8 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
__kernel void integrateLangevinPart1(__global float4* velm, __global float4* force, __global float4* posDelta, __kernel void integrateLangevinPart1(__global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta,
__global float* paramBuffer, __global float2* dt, __global float4* random, unsigned int randomIndex) { __global const float* restrict paramBuffer, __global const float2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
float vscale = paramBuffer[VelScale]; float vscale = paramBuffer[VelScale];
float fscale = paramBuffer[ForceScale]; float fscale = paramBuffer[ForceScale];
float noisescale = paramBuffer[NoiseScale]; float noisescale = paramBuffer[NoiseScale];
...@@ -31,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* velm, __global float4* for ...@@ -31,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* velm, __global float4* for
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
__kernel void integrateLangevinPart2(__global float4* posq, __global float4* posDelta, __global float4* velm, __global float2* dt) { __kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
#ifdef cl_khr_fp64 #ifdef cl_khr_fp64
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
...@@ -58,8 +58,8 @@ __kernel void integrateLangevinPart2(__global float4* posq, __global float4* pos ...@@ -58,8 +58,8 @@ __kernel void integrateLangevinPart2(__global float4* posq, __global float4* pos
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
__kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* dt, __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float tau, float kT, __global float2* restrict dt,
__global float4* velm, __global float4* force, __global float* paramBuffer, __local float* params, __local float* error) { __global const float4* restrict velm, __global const float4* restrict force, __global float* restrict paramBuffer, __local float* restrict params, __local float* restrict error) {
// Calculate the error. // Calculate the error.
float err = 0.0f; float err = 0.0f;
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
* Scale the particle positions. * Scale the particle positions.
*/ */
__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* restrict posq,
__global int* moleculeAtoms, __global int* moleculeStartIndex) { __global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) {
for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) { for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) {
int first = moleculeStartIndex[index]; int first = moleculeStartIndex[index];
int last = moleculeStartIndex[index+1]; int last = moleculeStartIndex[index+1];
......
...@@ -11,11 +11,11 @@ typedef struct { ...@@ -11,11 +11,11 @@ typedef struct {
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -12,11 +12,11 @@ typedef struct { ...@@ -12,11 +12,11 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float4* restrict tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -16,15 +16,15 @@ typedef struct { ...@@ -16,15 +16,15 @@ typedef struct {
*/ */
__kernel void computeNonbonded( __kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers, __global long* restrict forceBuffers,
#else #else
__global float4* forceBuffers, __global float4* restrict forceBuffers,
#endif #endif
__global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __local AtomData* restrict localData, __local float* restrict tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache,
__global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1); const float4 scale = 1.0f/(PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
...@@ -38,7 +39,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT ...@@ -38,7 +39,7 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
/** /**
* For each grid point, find the range of sorted atoms associated with that point. * For each grid point, find the range of sorted atoms associated with that point.
*/ */
__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0); int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0); int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y); int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
...@@ -75,7 +76,8 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int ...@@ -75,7 +76,8 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER) #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global long* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) { void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
int ix = get_local_id(0)/(PME_ORDER*PME_ORDER); int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER; int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER; int iy = remainder/PME_ORDER;
...@@ -122,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid ...@@ -122,7 +124,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
} }
} }
__kernel void finishSpreadCharge(__global long* pmeGrid) { __kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
__global float2* floatGrid = (__global float2*) pmeGrid; __global float2* floatGrid = (__global float2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF; float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
...@@ -133,7 +135,8 @@ __kernel void finishSpreadCharge(__global long* pmeGrid) { ...@@ -133,7 +135,8 @@ __kernel void finishSpreadCharge(__global long* pmeGrid) {
} }
} }
#else #else
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) { __kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) { for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point. // Compute the charge on a grid point.
...@@ -190,8 +193,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid ...@@ -190,8 +193,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
} }
#endif #endif
__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX, __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
__global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) { __global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f; float energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
...@@ -220,7 +223,8 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en ...@@ -220,7 +223,8 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] += 0.5f*energy;
} }
__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* bsplinesCache) { __kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid,
float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* restrict bsplinesCache) {
const float4 scale = 1.0f/(PME_ORDER-1); const float4 scale = 1.0f/(PME_ORDER-1);
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
__local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER]; __local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
......
__kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __local float4* bsplinesCache, __global int2* pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __global float4* restrict pmeBsplineDTheta, __local float4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1); const float4 scale = 1.0f/(PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
...@@ -42,10 +42,10 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT ...@@ -42,10 +42,10 @@ __kernel void updateBsplines(__global float4* posq, __global float4* pmeBsplineT
/** /**
* This kernel is not actually used when running on a CPU. * This kernel is not actually used when running on a CPU.
*/ */
__kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float4* posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
} }
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0); const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0); const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++) for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++)
...@@ -82,8 +82,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid ...@@ -82,8 +82,8 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
} }
} }
__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX, __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
__global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) { __global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f; float energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
...@@ -112,7 +112,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en ...@@ -112,7 +112,7 @@ __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* en
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] += 0.5f*energy;
} }
__kernel void gridInterpolateForce(__global float4* posq, __global float4* forceBuffers, __global float4* pmeBsplineTheta, __global float4* pmeBsplineDTheta, __global float2* pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float4* restrict pmeBsplineTheta, __global const float4* restrict pmeBsplineDTheta, __global const float2* restrict pmeGrid, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) { for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
float4 force = 0.0f; float4 force = 0.0f;
float4 pos = posq[atom]; float4 pos = posq[atom];
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Generate random numbers * Generate random numbers
*/ */
__kernel void generateRandomNumbers(int numValues, __global float4* random, __global uint4* seed) { __kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
int index = get_global_id(0); int index = get_global_id(0);
uint4 state = seed[index]; uint4 state = seed[index];
unsigned int carry = 0; unsigned int carry = 0;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Calculate the center of mass momentum. * Calculate the center of mass momentum.
*/ */
__kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __global float4* cmMomentum, __local float4* temp) { __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* restrict velm, __global float4* restrict cmMomentum, __local float4* restrict temp) {
int index = get_global_id(0); int index = get_global_id(0);
float4 cm = 0.0f; float4 cm = 0.0f;
while (index < numAtoms) { while (index < numAtoms) {
...@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __gl ...@@ -53,7 +53,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global float4* velm, __gl
* Remove center of mass motion. * Remove center of mass motion.
*/ */
__kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* velm, __global float4* cmMomentum, __local float4* temp) { __kernel void removeCenterOfMassMomentum(int numAtoms, __global float4* restrict velm, __global const float4* restrict cmMomentum, __local float4* restrict temp) {
// First sum all of the momenta that were calculated by individual groups. // First sum all of the momenta that were calculated by individual groups.
int index = get_local_id(0); int index = get_local_id(0);
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Enforce constraints on SETTLE clusters * Enforce constraints on SETTLE clusters
*/ */
__kernel void applySettle(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global float4* velm, __global int4* clusterAtoms, __global float2* clusterParams) { __kernel void applySettle(int numClusters, float tol, __global const float4* restrict oldPos, __global const float4* restrict posDelta, __global float4* restrict newDelta, __global const float4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0); int index = get_global_id(0);
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Enforce constraints on SHAKE clusters * Enforce constraints on SHAKE clusters
*/ */
__kernel void applyShakeToHydrogens(int numClusters, float tol, __global float4* oldPos, __global float4* posDelta, __global float4* newDelta, __global int4* clusterAtoms, __global float4* clusterParams) { __kernel void applyShakeToHydrogens(int numClusters, float tol, __global const float4* restrict oldPos, __global const float4* restrict posDelta, __global float4* restrict newDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
int index = get_global_id(0); int index = get_global_id(0);
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
......
...@@ -8,7 +8,7 @@ float getValue(TYPE value) { ...@@ -8,7 +8,7 @@ float getValue(TYPE value) {
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__kernel void computeRange(__global TYPE* data, int length, __global float2* range, __local float* buffer) { __kernel void computeRange(__global const TYPE* restrict data, int length, __global float2* restrict range, __local float* restrict buffer) {
float minimum = MAXFLOAT; float minimum = MAXFLOAT;
float maximum = -MAXFLOAT; float maximum = -MAXFLOAT;
...@@ -45,8 +45,8 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran ...@@ -45,8 +45,8 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
/** /**
* Assign elements to buckets. * Assign elements to buckets.
*/ */
__kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range, __kernel void assignElementsToBuckets(__global const TYPE* restrict data, int length, int numBuckets, __global const float2* restrict range,
__global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) { __global int* bucketOffset, __global int* restrict bucketOfElement, __global int* restrict offsetInBucket) {
#ifdef AMD_ATOMIC_WORK_AROUND #ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to bucketOffset to use the complete path. // Do a byte write to force all memory accesses to bucketOffset to use the complete path.
// This keeps the atomic access from forcing all word accesses to other buffers onto the slow complete path. // This keeps the atomic access from forcing all word accesses to other buffers onto the slow complete path.
...@@ -72,7 +72,7 @@ __kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBu ...@@ -72,7 +72,7 @@ __kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBu
* Sum the bucket sizes to compute the start position of each bucket. This kernel * Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group. * is executed as a single work group.
*/ */
__kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset, __local int* buffer) { __kernel void computeBucketPositions(int numBuckets, __global int* restrict bucketOffset, __local int* restrict buffer) {
int globalOffset = 0; int globalOffset = 0;
for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) { for (int startBucket = 0; startBucket < numBuckets; startBucket += get_local_size(0)) {
// Load the bucket sizes into local memory. // Load the bucket sizes into local memory.
...@@ -101,7 +101,7 @@ __kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset, ...@@ -101,7 +101,7 @@ __kernel void computeBucketPositions(int numBuckets, __global int* bucketOffset,
/** /**
* Copy the input data into the buckets for sorting. * Copy the input data into the buckets for sorting.
*/ */
__kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int length, __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) { __kernel void copyDataToBuckets(__global const TYPE* restrict data, __global TYPE* restrict buckets, int length, __global const int* restrict bucketOffset, __global const int* restrict bucketOfElement, __global const int* restrict offsetInBucket) {
for (int index = get_global_id(0); index < length; index += get_global_size(0)) { for (int index = get_global_id(0); index < length; index += get_global_size(0)) {
TYPE element = data[index]; TYPE element = data[index];
int bucketIndex = bucketOfElement[index]; int bucketIndex = bucketOfElement[index];
...@@ -113,7 +113,7 @@ __kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int ...@@ -113,7 +113,7 @@ __kernel void copyDataToBuckets(__global TYPE* data, __global TYPE* buckets, int
/** /**
* Sort the data in each bucket. * Sort the data in each bucket.
*/ */
__kernel void sortBuckets(__global TYPE* data, __global TYPE* buckets, int numBuckets, __global int* bucketOffset, __local TYPE* buffer) { __kernel void sortBuckets(__global TYPE* restrict data, __global const TYPE* restrict buckets, int numBuckets, __global const int* restrict bucketOffset, __local TYPE* restrict buffer) {
for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) { for (int index = get_group_id(0); index < numBuckets; index += get_num_groups(0)) {
int startIndex = (index == 0 ? 0 : bucketOffset[index-1]); int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
int endIndex = bucketOffset[index]; int endIndex = bucketOffset[index];
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Fill a buffer with 0. * Fill a buffer with 0.
*/ */
__kernel void clearBuffer(__global int* buffer, int size) { __kernel void clearBuffer(__global int* restrict buffer, int size) {
int index = get_global_id(0); int index = get_global_id(0);
__global int4* buffer4 = (__global int4*) buffer; __global int4* buffer4 = (__global int4*) buffer;
int sizeDiv4 = size/4; int sizeDiv4 = size/4;
...@@ -18,7 +18,7 @@ __kernel void clearBuffer(__global int* buffer, int size) { ...@@ -18,7 +18,7 @@ __kernel void clearBuffer(__global int* buffer, int size) {
/** /**
* Fill two buffers with 0. * Fill two buffers with 0.
*/ */
__kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2) { __kernel void clearTwoBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2) {
clearBuffer(buffer1, size1); clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2); clearBuffer(buffer2, size2);
} }
...@@ -26,7 +26,7 @@ __kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* bu ...@@ -26,7 +26,7 @@ __kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* bu
/** /**
* Fill three buffers with 0. * Fill three buffers with 0.
*/ */
__kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3) { __kernel void clearThreeBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3) {
clearBuffer(buffer1, size1); clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2); clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3); clearBuffer(buffer3, size3);
...@@ -35,7 +35,7 @@ __kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int* ...@@ -35,7 +35,7 @@ __kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int*
/** /**
* Fill four buffers with 0. * Fill four buffers with 0.
*/ */
__kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4) { __kernel void clearFourBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4) {
clearBuffer(buffer1, size1); clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2); clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3); clearBuffer(buffer3, size3);
...@@ -45,7 +45,7 @@ __kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* b ...@@ -45,7 +45,7 @@ __kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* b
/** /**
* Fill five buffers with 0. * Fill five buffers with 0.
*/ */
__kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5) { __kernel void clearFiveBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4, __global int* restrict buffer5, int size5) {
clearBuffer(buffer1, size1); clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2); clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3); clearBuffer(buffer3, size3);
...@@ -56,7 +56,7 @@ __kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* b ...@@ -56,7 +56,7 @@ __kernel void clearFiveBuffers(__global int* buffer1, int size1, __global int* b
/** /**
* Fill six buffers with 0. * Fill six buffers with 0.
*/ */
__kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4, __global int* buffer5, int size5, __global int* buffer6, int size6) { __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __global int* restrict buffer2, int size2, __global int* restrict buffer3, int size3, __global int* restrict buffer4, int size4, __global int* restrict buffer5, int size5, __global int* restrict buffer6, int size6) {
clearBuffer(buffer1, size1); clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2); clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3); clearBuffer(buffer3, size3);
...@@ -69,7 +69,7 @@ __kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* bu ...@@ -69,7 +69,7 @@ __kernel void clearSixBuffers(__global int* buffer1, int size1, __global int* bu
* Sum a collection of buffers into the first one. * Sum a collection of buffers into the first one.
*/ */
__kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int numBuffers) { __kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSize, int numBuffers) {
int index = get_global_id(0); int index = get_global_id(0);
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
while (index < bufferSize) { while (index < bufferSize) {
...@@ -84,7 +84,7 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu ...@@ -84,7 +84,7 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
/** /**
* Sum the various buffers containing forces. * Sum the various buffers containing forces.
*/ */
__kernel void reduceForces(__global long* longBuffer, __global float4* buffer, int bufferSize, int numBuffers) { __kernel void reduceForces(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
float scale = 1.0f/(float) 0xFFFFFFFF; float scale = 1.0f/(float) 0xFFFFFFFF;
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
...@@ -99,7 +99,7 @@ __kernel void reduceForces(__global long* longBuffer, __global float4* buffer, i ...@@ -99,7 +99,7 @@ __kernel void reduceForces(__global long* longBuffer, __global float4* buffer, i
* This is called to determine the accuracy of various native functions. * This is called to determine the accuracy of various native functions.
*/ */
__kernel void determineNativeAccuracy(__global float8* values, int numValues) { __kernel void determineNativeAccuracy(__global float8* restrict values, int numValues) {
for (int i = get_global_id(0); i < numValues; i += get_global_size(0)) { for (int i = get_global_id(0); i < numValues; i += get_global_size(0)) {
float v = values[i].s0; float v = values[i].s0;
values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f); values[i] = (float8) (v, native_sqrt(v), native_rsqrt(v), native_recip(v), native_exp(v), native_log(v), 0.0f, 0.0f);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Perform the first step of verlet integration. * Perform the first step of verlet integration.
*/ */
__kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global float4* posq, __global float4* velm, __global float4* force, __global float4* posDelta) { __kernel void integrateVerletPart1(int numAtoms, __global const float2* restrict dt, __global const float4* restrict posq, __global float4* restrict velm, __global const float4* restrict force, __global float4* restrict posDelta) {
float2 stepSize = dt[0]; float2 stepSize = dt[0];
float dtPos = stepSize.y; float dtPos = stepSize.y;
float dtVel = 0.5f*(stepSize.x+stepSize.y); float dtVel = 0.5f*(stepSize.x+stepSize.y);
...@@ -26,7 +26,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global f ...@@ -26,7 +26,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global float2* dt, __global f
* Perform the second step of verlet integration. * Perform the second step of verlet integration.
*/ */
__kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global float4* posq, __global float4* velm, __global float4* posDelta) { __kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, __global float4* restrict posq, __global float4* restrict velm, __global const float4* restrict posDelta) {
float2 stepSize = dt[0]; float2 stepSize = dt[0];
#ifdef cl_khr_fp64 #ifdef cl_khr_fp64
double oneOverDt = 1.0/stepSize.y; double oneOverDt = 1.0/stepSize.y;
...@@ -57,7 +57,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global f ...@@ -57,7 +57,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* dt, __global f
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
__kernel void selectVerletStepSize(int numAtoms, float maxStepSize, float errorTol, __global float2* dt, __global float4* velm, __global float4* force, __local float* error) { __kernel void selectVerletStepSize(int numAtoms, float maxStepSize, float errorTol, __global float2* restrict dt, __global const float4* restrict velm, __global const float4* restrict force, __local float* restrict error) {
// Calculate the error. // Calculate the error.
float err = 0.0f; float err = 0.0f;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment