Commit 69e75377 authored by Peter Eastman's avatar Peter Eastman
Browse files

Added "const" and "restrict" to lots of kernel arguments to let the compiler do more optimizations

parent bf8b9f30
......@@ -154,12 +154,12 @@ void OpenCLBondedUtilities::initialize(const System& system) {
const vector<int>& set = *iter;
int setSize = set.size();
stringstream s;
s<<"__kernel void computeBondedForces(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq";
s<<"__kernel void computeBondedForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq";
for (int i = 0; i < setSize; i++) {
int force = set[i];
string indexType = "uint"+(indexWidth[force] == 1 ? "" : OpenCLExpressionUtilities::intToString(indexWidth[force]));
s<<", __global "<<indexType<<"* atomIndices"<<i;
s<<", __global "<<indexType<<"* bufferIndices"<<i;
s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
}
for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
......
......@@ -1739,7 +1739,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
tableArgs << ", __global float4* " << arrayName;
tableArgs << ", __global const float4* restrict " << arrayName;
}
if (force.getNumFunctions() > 0) {
tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
......@@ -1837,7 +1837,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << paramName << ", __local " << buffer.getType() << "* local_" << paramName;
extraArgs << ", __global const " << buffer.getType() << "* restrict global_" << paramName << ", __local " << buffer.getType() << "* restrict local_" << paramName;
loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
......@@ -1884,12 +1884,12 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* " << paramName;
extraArgs << ", __global const " << buffer.getType() << "* restrict " << paramName;
}
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string valueName = "values"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << valueName;
extraArgs << ", __global " << buffer.getType() << "* restrict global_" << valueName;
reductionSource << buffer.getType() << " local_" << valueName << ";\n";
}
reductionSource << "local_values" << computedValues->getParameterSuffix(0) << " = sum;\n";
......@@ -1977,7 +1977,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << paramName << ", __local " << buffer.getType() << "* local_" << paramName;
extraArgs << ", __global const " << buffer.getType() << "* restrict global_" << paramName << ", __local " << buffer.getType() << "* restrict local_" << paramName;
loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
......@@ -1986,17 +1986,17 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string valueName = "values"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << valueName << ", __local " << buffer.getType() << "* local_" << valueName;
extraArgs << ", __global const " << buffer.getType() << "* restrict global_" << valueName << ", __local " << buffer.getType() << "* restrict local_" << valueName;
loadLocal1 << "local_" << valueName << "[localAtomIndex] = " << valueName << "1;\n";
loadLocal2 << "local_" << valueName << "[localAtomIndex] = global_" << valueName << "[j];\n";
load1 << buffer.getType() << " " << valueName << "1 = global_" << valueName << "[atom1];\n";
load2 << buffer.getType() << " " << valueName << "2 = local_" << valueName << "[atom2];\n";
}
if (useLong) {
extraArgs << ", __global long* derivBuffers";
extraArgs << ", __global long* restrict derivBuffers";
for (int i = 0; i < force.getNumComputedValues(); i++) {
string index = intToString(i+1);
extraArgs << ", __local float* local_deriv" << index;
extraArgs << ", __local float* restrict local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
declare1 << "float deriv" << index << "_1 = 0.0f;\n";
load2 << "float deriv" << index << "_2 = 0.0f;\n";
......@@ -2011,7 +2011,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
extraArgs << ", __global " << buffer.getType() << "* restrict derivBuffers" << index << ", __local " << buffer.getType() << "* restrict local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
......@@ -2068,21 +2068,21 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* " << paramName;
extraArgs << ", __global const " << buffer.getType() << "* restrict " << paramName;
}
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string valueName = "values"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* " << valueName;
extraArgs << ", __global const " << buffer.getType() << "* restrict " << valueName;
}
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index;
extraArgs << ", __global " << buffer.getType() << "* restrict derivBuffers" << index;
compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
}
if (useLong) {
extraArgs << ", __global long* derivBuffersIn";
extraArgs << ", __global const long* restrict derivBuffersIn";
for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
" = (1.0f/0xFFFFFFFF)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << intToString(i) << "];\n";
......@@ -2147,17 +2147,17 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* " << paramName;
extraArgs << ", __global const " << buffer.getType() << "* restrict " << paramName;
}
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string valueName = "values"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* " << valueName;
extraArgs << ", __global const " << buffer.getType() << "* restrict " << valueName;
}
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index;
extraArgs << ", __global " << buffer.getType() << "* restrict derivBuffers" << index;
compute << buffer.getType() << " deriv" << index << " = derivBuffers" << index << "[index];\n";
}
map<string, string> variables;
......@@ -2866,12 +2866,12 @@ void OpenCLCalcCustomHbondForceKernel::initialize(const System& system, const Cu
vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, min, max);
tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
tableArgs << ", __global float4* " << arrayName;
tableArgs << ", __global const float4* restrict " << arrayName;
}
if (force.getNumFunctions() > 0) {
tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
tableArgs << ", __global float4* functionParams";
tableArgs << ", __global const float4* restrict functionParams";
}
// Record information about parameters.
......@@ -2973,15 +2973,15 @@ void OpenCLCalcCustomHbondForceKernel::initialize(const System& system, const Cu
// Next it needs to load parameters from global memory.
if (force.getNumGlobalParameters() > 0)
extraArgs << ", __global float* globals";
extraArgs << ", __global const float* restrict globals";
for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = donorParams->getBuffers()[i];
extraArgs << ", __global "+buffer.getType()+"* donor"+buffer.getName();
extraArgs << ", __global const "+buffer.getType()+"* restrict donor"+buffer.getName();
addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" donorParams"+intToString(i+1)+" = donor"+buffer.getName()+"[index];\n");
}
for (int i = 0; i < (int) acceptorParams->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = acceptorParams->getBuffers()[i];
extraArgs << ", __global "+buffer.getType()+"* acceptor"+buffer.getName();
extraArgs << ", __global const "+buffer.getType()+"* restrict acceptor"+buffer.getName();
addDonorAndAcceptorCode(computeDonor, computeAcceptor, buffer.getType()+" acceptorParams"+intToString(i+1)+" = acceptor"+buffer.getName()+"[index];\n");
}
......
......@@ -399,9 +399,9 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args;
for (int i = 0; i < (int) params.size(); i++) {
args << ", __global ";
args << ", __global const ";
args << params[i].getType();
args << "* global_";
args << "* restrict global_";
args << params[i].getName();
}
for (int i = 0; i < (int) arguments.size(); i++) {
......@@ -411,11 +411,11 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
}
else {
if ((arguments[i].getMemory().getInfo<CL_MEM_FLAGS>() & CL_MEM_READ_ONLY) == 0)
args << ", __global ";
args << ", __global const ";
else
args << ", __constant ";
args << arguments[i].getType();
args << "* ";
args << "* restrict ";
args << arguments[i].getName();
}
}
......
......@@ -2,8 +2,8 @@
* Apply the Andersen thermostat to adjust particle velocities.
*/
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global float4* velm, __global float2* stepSize, __global float4* random,
unsigned int randomIndex, __global int* atomGroups) {
__kernel void applyAndersenThermostat(float collisionFrequency, float kT, __global float4* velm, __global const float2* restrict stepSize, __global const float4* restrict random,
unsigned int randomIndex, __global const int* restrict atomGroups) {
float collisionProbability = 1.0f-exp(-collisionFrequency*stepSize[0].y);
float randomRange = erf(collisionProbability/sqrt(2.0f));
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
......
......@@ -2,8 +2,8 @@
* Perform the first step of Brownian integration.
*/
__kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __global float4* force,
__global float4* posDelta, __global float4* velm, __global float4* random, unsigned int randomIndex) {
__kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __global const float4* restrict force,
__global float4* restrict posDelta, __global const float4* restrict velm, __global const float4* restrict random, unsigned int randomIndex) {
randomIndex += get_global_id(0);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float invMass = velm[index].w;
......@@ -16,7 +16,7 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
* Perform the second step of Brownian integration.
*/
__kernel void integrateBrownianPart2(float oneOverDeltaT, __global float4* posq, __global float4* velm, __global float4* posDelta) {
__kernel void integrateBrownianPart2(float oneOverDeltaT, __global float4* posq, __global float4* velm, __global const float4* restrict posDelta) {
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
float4 delta = posDelta[index];
velm[index].xyz = oneOverDeltaT*delta.xyz;
......
/**
* Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
__kernel void computeConstraintDirections(__global int2* constraintAtoms, __global float4* constraintDistance, __global float4* atomPositions) {
__kernel void computeConstraintDirections(__global const int2* restrict constraintAtoms, __global float4* restrict constraintDistance, __global const float4* restrict atomPositions) {
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the direction for this constraint.
......@@ -19,8 +19,8 @@ __kernel void computeConstraintDirections(__global int2* constraintAtoms, __glob
/**
* Compute the force applied by each constraint.
*/
__kernel void computeConstraintForce(__global int2* constraintAtoms, __global float4* constraintDistance, __global float4* atomPositions,
__global float* reducedMass, __global float* delta1, __global int* converged, float tol, int iteration) {
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const float4* restrict constraintDistance, __global const float4* restrict atomPositions,
__global const float* restrict reducedMass, __global float* restrict delta1, __global int* restrict converged, float tol, int iteration) {
__local int groupConverged;
if (converged[1-iteration%2]) {
if (get_global_id(0) == 0)
......@@ -58,8 +58,8 @@ __kernel void computeConstraintForce(__global int2* constraintAtoms, __global fl
/**
* Multiply the vector of constraint forces by the constraint matrix.
*/
__kernel void multiplyByConstraintMatrix(__global float* delta1, __global float* delta2, __global int* constraintMatrixColumn,
__global float* constraintMatrixValue, __global int* converged, int iteration) {
__kernel void multiplyByConstraintMatrix(__global const float* restrict delta1, __global float* restrict delta2, __global const int* restrict constraintMatrixColumn,
__global const float* restrict constraintMatrixValue, __global const int* restrict converged, int iteration) {
if (converged[iteration%2])
return; // The constraint iteration has already converged.
......@@ -81,8 +81,8 @@ __kernel void multiplyByConstraintMatrix(__global float* delta1, __global float*
/**
* Update the atom positions based on constraint forces.
*/
__kernel void updateAtomPositions(__global int* numAtomConstraints, __global int* atomConstraints, __global float4* constraintDistance,
__global float4* atomPositions, __global float4* velm, __global float* delta1, __global float* delta2, __global int* converged, int iteration) {
__kernel void updateAtomPositions(__global const int* restrict numAtomConstraints, __global const int* restrict atomConstraints, __global const float4* restrict constraintDistance,
__global float4* restrict atomPositions, __global const float4* restrict velm, __global const float* restrict delta1, __global const float* restrict delta2, __global int* restrict converged, int iteration) {
if (get_global_id(0) == 0)
converged[1-iteration%2] = 1;
if (converged[iteration%2])
......
......@@ -56,7 +56,7 @@ unsigned int sumReduce128(__local unsigned int* arr) {
return arr[0];
}
__kernel void countElts(__global unsigned int* dgBlockCounts, __global unsigned int* dgValid, const unsigned int len, __local unsigned int* dsCount) {
__kernel void countElts(__global unsigned int* restrict dgBlockCounts, __global const unsigned int* restrict dgValid, const unsigned int len, __local unsigned int* restrict dsCount) {
dsCount[get_local_id(0)] = 0;
unsigned int ub;
const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
......@@ -110,9 +110,9 @@ int compactSIMDPrefixSum(__local const unsigned int* dsData, __local const unsig
return numValid;
}
__kernel void moveValidElementsStaged(__global unsigned int* dgData, __global unsigned int* dgCompact, __global unsigned int* dgValid,
__global unsigned int* dgBlockCounts, unsigned int len, __global unsigned int* dNumValidElements,
__local unsigned int* inBlock, __local unsigned int* validBlock, __local unsigned int* compactBlock) {
__kernel void moveValidElementsStaged(__global const unsigned int* restrict dgData, __global unsigned int* restrict dgCompact, __global const unsigned int* restrict dgValid,
__global const unsigned int* restrict dgBlockCounts, unsigned int len, __global unsigned int* restrict dNumValidElements,
__local unsigned int* restrict inBlock, __local unsigned int* restrict validBlock, __local unsigned int* restrict compactBlock) {
__local unsigned int dsLocalIndex[256];
int blockOutOffset=0;
// Sum up the blockCounts before us to find our offset
......
......@@ -6,11 +6,11 @@
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
__kernel void computeN2Energy(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
......@@ -7,11 +7,11 @@
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
void computeN2Energy(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempForceBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else
unsigned int numTiles
#endif
......
......@@ -14,15 +14,15 @@
*/
__kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers,
__global long* restrict forceBuffers,
#else
__global float4* forceBuffers,
__global float4* restrict forceBuffers,
#endif
__global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
__global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
......@@ -8,7 +8,7 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global float4* forceBuffers, __global float* energyBuffer, __global float4* posq
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq
PARAMETER_ARGUMENTS) {
float energy = 0.0f;
unsigned int index = get_global_id(0);
......
......@@ -2,7 +2,7 @@
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
__kernel void computeGradientChainRuleTerms(__global float4* forceBuffers, __global float4* posq
__kernel void computeGradientChainRuleTerms(__global float4* restrict forceBuffers, __global const float4* restrict posq
PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
......
......@@ -4,11 +4,11 @@
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer,
__kernel void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global float* restrict global_value, __local float* restrict local_value,
__local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
......@@ -5,11 +5,11 @@
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer,
void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global float* restrict global_value, __local float* restrict local_value,
__local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else
unsigned int numTiles
#endif
......
......@@ -7,16 +7,16 @@
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices,
__kernel void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* global_value,
__global long* restrict global_value,
#else
__global float* global_value,
__global float* restrict global_value,
#endif
__local float* local_value, __local float* tempBuffer,
__local float* restrict local_value, __local float* restrict tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......
......@@ -55,8 +55,8 @@ float4 computeCross(float4 vec1, float4 vec2) {
/**
* Compute forces on donors.
*/
__kernel void computeDonorForces(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global int4* exclusions,
__global int4* donorAtoms, __global int4* acceptorAtoms, __global int4* donorBufferIndices, __local float4* posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
__kernel void computeDonorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local float4* posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
float energy = 0.0f;
float4 f1 = (float4) 0;
......@@ -141,8 +141,8 @@ __kernel void computeDonorForces(__global float4* forceBuffers, __global float*
/**
* Compute forces on acceptors.
*/
__kernel void computeAcceptorForces(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global int4* exclusions,
__global int4* donorAtoms, __global int4* acceptorAtoms, __global int4* acceptorBufferIndices, __local float4* posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
__kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local float4* restrict posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
float4 f1 = (float4) 0;
float4 f2 = (float4) 0;
......
......@@ -7,7 +7,7 @@ float2 multofFloat2(float2 a, float2 b) {
* Precompute the cosine and sine sums which appear in each force term.
*/
__kernel void calculateEwaldCosSinSums(__global float* energyBuffer, __global float4* posq, __global float2* cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
__kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __global const float4* restrict posq, __global float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
const unsigned int ksizex = 2*KMAX_X-1;
const unsigned int ksizey = 2*KMAX_Y-1;
const unsigned int ksizez = 2*KMAX_Z-1;
......@@ -58,7 +58,7 @@ __kernel void calculateEwaldCosSinSums(__global float* energyBuffer, __global fl
* previous routine.
*/
__kernel void calculateEwaldForces(__global float4* forceBuffers, __global float4* posq, __global float2* cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
__kernel void calculateEwaldForces(__global float4* restrict forceBuffers, __global const float4* restrict posq, __global const float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
unsigned int atom = get_global_id(0);
while (atom < NUM_ATOMS) {
float4 force = forceBuffers[atom];
......
......@@ -6,7 +6,8 @@ float2 multiplyComplex(float2 c1, float2 c2) {
* Perform a 1D FFT on each row along one axis.
*/
__kernel void execFFT(__global float2* in, __global float2* out, float sign, __local float2* w, __local float2* data0, __local float2* data1) {
__kernel void execFFT(__global const float2* restrict in, __global float2* restrict out, float sign, __local float2* restrict w,
__local float2* restrict data0, __local float2* restrict data1) {
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (float2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE);
......
......@@ -8,7 +8,7 @@
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, __global unsigned int* interactionCount) {
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
......@@ -47,7 +47,7 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
*/
void storeInteractionData(__local ushort2* buffer, __local int* valid, __local short* sum, __local ushort2* temp, __local int* baseIndex,
__global unsigned int* interactionCount, __global ushort2* interactingTiles, float cutoffSquared, float4 periodicBoxSize,
float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, unsigned int maxTiles) {
float4 invPeriodicBoxSize, __global const float4* posq, __global const float4* blockCenter, __global const float4* blockBoundingBox, unsigned int maxTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
......@@ -144,9 +144,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
......@@ -220,8 +220,8 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global ushort2* tiles, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local unsigned int* flags, unsigned int maxTiles) {
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local unsigned int* restrict flags, unsigned int maxTiles) {
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int numTiles = interactionCount[0];
......
......@@ -8,7 +8,7 @@
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, __global unsigned int* interactionCount) {
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
......@@ -121,9 +121,9 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment