Commit 59f8cf87 authored by Peter Eastman's avatar Peter Eastman
Browse files

Finished implementing double precision for OpenCL

parent c8dac206
......@@ -575,11 +575,11 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Build the list of virtual sites.
vector<mm_int4> vsite2AvgAtomVec;
vector<mm_float2> vsite2AvgWeightVec;
vector<mm_double2> vsite2AvgWeightVec;
vector<mm_int4> vsite3AvgAtomVec;
vector<mm_float4> vsite3AvgWeightVec;
vector<mm_double4> vsite3AvgWeightVec;
vector<mm_int4> vsiteOutOfPlaneAtomVec;
vector<mm_float4> vsiteOutOfPlaneWeightVec;
vector<mm_double4> vsiteOutOfPlaneWeightVec;
for (int i = 0; i < numAtoms; i++) {
if (system.isVirtualSite(i)) {
if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
......@@ -587,21 +587,21 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
vsite2AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), 0));
vsite2AvgWeightVec.push_back(mm_float2((float) site.getWeight(0), (float) site.getWeight(1)));
vsite2AvgWeightVec.push_back(mm_double2(site.getWeight(0), site.getWeight(1)));
}
else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// A three particle average.
const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
vsite3AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsite3AvgWeightVec.push_back(mm_float4((float) site.getWeight(0), (float) site.getWeight(1), (float) site.getWeight(2), 0.0f));
vsite3AvgWeightVec.push_back(mm_double4(site.getWeight(0), site.getWeight(1), site.getWeight(2), 0.0));
}
else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// An out of plane site.
const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
vsiteOutOfPlaneAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsiteOutOfPlaneWeightVec.push_back(mm_float4((float) site.getWeight12(), (float) site.getWeight13(), (float) site.getWeightCross(), 0.0f));
vsiteOutOfPlaneWeightVec.push_back(mm_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
}
}
}
......@@ -609,22 +609,47 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
vsite2AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite2AvgWeights = OpenCLArray::create<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsite3AvgWeights = OpenCLArray::create<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneAtoms = OpenCLArray::create<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
if (num2Avg > 0) {
if (num2Avg > 0)
vsite2AvgAtoms->upload(vsite2AvgAtomVec);
if (num3Avg > 0)
vsite3AvgAtoms->upload(vsite3AvgAtomVec);
if (numOutOfPlane > 0)
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
if (context.getUseDoublePrecision()) {
vsite2AvgWeights = OpenCLArray::create<mm_double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = OpenCLArray::create<mm_double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
if (num2Avg > 0)
vsite2AvgWeights->upload(vsite2AvgWeightVec);
if (num3Avg > 0)
vsite3AvgWeights->upload(vsite3AvgWeightVec);
if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
}
else {
vsite2AvgWeights = OpenCLArray::create<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = OpenCLArray::create<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
if (num2Avg > 0) {
vector<mm_float2> floatWeights(num2Avg);
for (int i = 0; i < num2Avg; i++)
floatWeights[i] = mm_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y);
vsite2AvgWeights->upload(floatWeights);
}
if (num3Avg > 0) {
vsite3AvgAtoms->upload(vsite3AvgAtomVec);
vsite3AvgWeights->upload(vsite3AvgWeightVec);
vector<mm_float4> floatWeights(num3Avg);
for (int i = 0; i < num3Avg; i++)
floatWeights[i] = mm_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f);
vsite3AvgWeights->upload(floatWeights);
}
if (numOutOfPlane > 0) {
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
vector<mm_float4> floatWeights(numOutOfPlane);
for (int i = 0; i < numOutOfPlane; i++)
floatWeights[i] = mm_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f);
vsiteOutOfPlaneWeights->upload(floatWeights);
}
}
// Create the kernels for virtual sites.
......
......@@ -2006,33 +2006,40 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
throw OpenMMException("GBSAOBCForce does not support using multiple OpenCL devices");
OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
params = OpenCLArray::create<mm_float2>(cl, cl.getPaddedNumAtoms(), "gbsaObcParams");
bornRadii = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "bornRadii");
obcChain = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "obcChain");
int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
bornRadii = new OpenCLArray(cl, cl.getPaddedNumAtoms(), elementSize, "bornRadii");
obcChain = new OpenCLArray(cl, cl.getPaddedNumAtoms(), elementSize, "obcChain");
if (cl.getSupports64BitGlobalAtomics()) {
longBornSum = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornSum");
longBornForce = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornForce");
bornForce = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "bornForce");
bornForce = new OpenCLArray(cl, cl.getPaddedNumAtoms(), elementSize, "bornForce");
cl.addAutoclearBuffer(*longBornSum);
cl.addAutoclearBuffer(*longBornForce);
}
else {
bornSum = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
bornForce = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
bornSum = new OpenCLArray(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "bornSum");
bornForce = new OpenCLArray(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "bornForce");
cl.addAutoclearBuffer(*bornSum);
cl.addAutoclearBuffer(*bornForce);
}
vector<mm_float4> posq(cl.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
int numParticles = force.getNumParticles();
vector<mm_float4> posqf(cl.getPaddedNumAtoms());
vector<mm_double4> posqd(cl.getPaddedNumAtoms());
vector<mm_float2> paramsVector(cl.getPaddedNumAtoms());
const double dielectricOffset = 0.009;
for (int i = 0; i < numParticles; i++) {
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, radius, scalingFactor;
force.getParticleParameters(i, charge, radius, scalingFactor);
radius -= dielectricOffset;
paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
posq[i].w = (float) charge;
if (cl.getUseDoublePrecision())
posqd[i] = mm_double4(0, 0, 0, charge);
else
posqf[i] = mm_float4(0, 0, 0, (float) charge);
}
cl.getPosq().upload(posq);
if (cl.getUseDoublePrecision())
cl.getPosq().upload(posqd);
else
cl.getPosq().upload(posqf);
params->upload(paramsVector);
prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
......@@ -2040,7 +2047,7 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
string source = OpenCLKernelSources::gbsaObc2;
nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "real", 1, elementSize, bornForce->getDeviceBuffer()));;
cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
}
......@@ -2141,10 +2148,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
}
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(5, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(6, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(cl, computeBornSumKernel, 5);
setInvPeriodicBoxSizeArg(cl, computeBornSumKernel, 6);
setPeriodicBoxSizeArg(cl, force1Kernel, 7);
setInvPeriodicBoxSizeArg(cl, force1Kernel, 8);
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(3, nb.getInteractingTiles().getDeviceBuffer());
......@@ -2174,8 +2181,9 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context,
// Record the per-particle parameters.
OpenCLArray& posq = cl.getPosq();
posq.download((mm_float4*) cl.getPinnedBuffer());
mm_float4* posqf = (mm_float4*) cl.getPinnedBuffer();
mm_double4* posqd = (mm_double4*) cl.getPinnedBuffer();
posq.download(cl.getPinnedBuffer());
vector<mm_float2> paramsVector(cl.getPaddedNumAtoms());
const double dielectricOffset = 0.009;
for (int i = 0; i < numParticles; i++) {
......@@ -2183,6 +2191,9 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context,
force.getParticleParameters(i, charge, radius, scalingFactor);
radius -= dielectricOffset;
paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
if (cl.getUseDoublePrecision())
posqd[i].w = charge;
else
posqf[i].w = (float) charge;
}
posq.upload(cl.getPinnedBuffer());
......@@ -2597,14 +2608,14 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
extraArgs << ", __global long* restrict derivBuffers";
for (int i = 0; i < force.getNumComputedValues(); i++) {
string index = cl.intToString(i+1);
extraArgs << ", __local float* restrict local_deriv" << index;
extraArgs << ", __local real* restrict local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
declare1 << "float deriv" << index << "_1 = 0.0f;\n";
load2 << "float deriv" << index << "_2 = 0.0f;\n";
declare1 << "real deriv" << index << "_1 = 0;\n";
load2 << "real deriv" << index << "_2 = 0;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
declareTemps << "__local float tempDerivBuffer" << index << "[64];\n";
declareTemps << "__local real tempDerivBuffer" << index << "[64];\n";
setTemps << "tempDerivBuffer" << index << "[get_local_id(0)] = deriv" << index << "_1;\n";
}
}
......@@ -2728,14 +2739,14 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
for (int i = 1; i < force.getNumComputedValues(); i++)
for (int j = 0; j < i; j++)
expressions["float dV"+cl.intToString(i)+"dV"+cl.intToString(j)+" = "] = valueDerivExpressions[i][j];
expressions["real dV"+cl.intToString(i)+"dV"+cl.intToString(j)+" = "] = valueDerivExpressions[i][j];
compute << cl.getExpressionUtilities().createExpressions(expressions, variables, functionDefinitions, "temp", prefix+"functionParams");
// Record values.
compute << "forceBuffers[index] = forceBuffers[index]+force;\n";
for (int i = 1; i < force.getNumComputedValues(); i++) {
compute << "float totalDeriv"<<i<<" = dV"<<i<<"dV0";
compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
for (int j = 1; j < i; j++)
compute << " + totalDeriv"<<j<<"*dV"<<i<<"dV"<<j;
compute << ";\n";
......@@ -2789,12 +2800,12 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]");
for (int i = 1; i < force.getNumComputedValues(); i++) {
string is = cl.intToString(i);
compute << "float4 dV"<<is<<"dR = (float4) 0;\n";
compute << "real4 dV"<<is<<"dR = (real4) 0;\n";
for (int j = 1; j < i; j++) {
if (!isZeroExpression(valueDerivExpressions[i][j])) {
map<string, Lepton::ParsedExpression> derivExpressions;
string js = cl.intToString(j);
derivExpressions["float dV"+is+"dV"+js+" = "] = valueDerivExpressions[i][j];
derivExpressions["real dV"+is+"dV"+js+" = "] = valueDerivExpressions[i][j];
compute << cl.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, "temp_"+is+"_"+js, prefix+"functionParams");
compute << "dV"<<is<<"dR += dV"<<is<<"dV"<<js<<"*dV"<<js<<"dR;\n";
}
......@@ -2845,8 +2856,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
map<string, Lepton::ParsedExpression> derivExpressions;
stringstream chainSource;
Lepton::ParsedExpression dVdR = Lepton::Parser::parse(computedValueExpressions[0], functions).differentiate("r").optimize();
derivExpressions["float dV0dR1 = "] = dVdR;
derivExpressions["float dV0dR2 = "] = dVdR.renameVariables(rename);
derivExpressions["real dV0dR1 = "] = dVdR;
derivExpressions["real dV0dR2 = "] = dVdR.renameVariables(rename);
chainSource << cl.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, prefix+"temp0_", prefix+"functionParams");
if (needChainForValue[0]) {
if (useExclusionsForValue)
......@@ -2911,6 +2922,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
if (!hasInitializedKernels) {
hasInitializedKernels = true;
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
......@@ -2921,21 +2933,21 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
cl.clearBuffer(*longValueBuffers);
}
else {
valueBuffers = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "customGBValueBuffers");
valueBuffers = new OpenCLArray(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "customGBValueBuffers");
cl.addAutoclearBuffer(*valueBuffers);
cl.clearBuffer(*valueBuffers);
}
int index = 0;
pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -2979,9 +2991,9 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
index = 0;
pairEnergyKernel.setArg<cl::Buffer>(index++, useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
......@@ -3017,7 +3029,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
if (useLong) {
pairEnergyKernel.setArg<cl::Memory>(index++, longEnergyDerivs->getDeviceBuffer());
for (int i = 0; i < numComputedValues; ++i)
pairEnergyKernel.setArg(index++, nb.getForceThreadBlockSize()*sizeof(cl_float), NULL);
pairEnergyKernel.setArg(index++, nb.getForceThreadBlockSize()*elementSize, NULL);
}
else {
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
......@@ -3078,10 +3090,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues);
}
if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(cl, pairValueKernel, 10);
setInvPeriodicBoxSizeArg(cl, pairValueKernel, 11);
setPeriodicBoxSizeArg(cl, pairEnergyKernel, 11);
setInvPeriodicBoxSizeArg(cl, pairEnergyKernel, 12);
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer());
......@@ -5340,8 +5352,8 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context,
}
cl.getQueue().enqueueCopyBuffer(cl.getPosq().getDeviceBuffer(), savedPositions->getDeviceBuffer(), 0, 0, cl.getPosq().getSize()*sizeof(mm_float4));
kernel.setArg<cl_float>(0, (cl_float) scale);
kernel.setArg<mm_float4>(2, cl.getPeriodicBoxSize());
kernel.setArg<mm_float4>(3, cl.getInvPeriodicBoxSize());
setPeriodicBoxSizeArg(cl, kernel, 2);
setInvPeriodicBoxSizeArg(cl, kernel, 3);
cl.executeKernel(kernel, cl.getNumAtoms());
for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++)
cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0);
......
......@@ -256,8 +256,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
interactionFlags = OpenCLArray::create<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags");
interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount");
blockCenter = OpenCLArray::create<mm_float4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = OpenCLArray::create<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
int elementSize = (context.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
blockCenter = new OpenCLArray(context, numAtomBlocks, elementSize, "blockCenter");
blockBoundingBox = new OpenCLArray(context, numAtomBlocks, elementSize, "blockBoundingBox");
vector<cl_uint> count(1, 0);
interactionCount->upload(count);
}
......
......@@ -4,10 +4,10 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
#ifdef USE_SYMMETRIC
float tempForce = 0.0f;
real tempForce = 0.0f;
#else
float4 tempForce1 = (float4) 0.0f;
float4 tempForce2 = (float4) 0.0f;
real4 tempForce1 = (real4) 0;
real4 tempForce2 = (real4) 0;
#endif
COMPUTE_FORCE
#ifdef USE_SYMMETRIC
......
......@@ -6,11 +6,11 @@
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempBuffer,
__kernel void computeN2Energy(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -23,7 +23,7 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
......@@ -79,30 +79,30 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
unsigned int excl = exclusions[exclusionIndex+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
......@@ -129,7 +129,7 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
// This is an off-diagonal tile.
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
local_force[localAtomIndex] = 0.0f;
local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES
}
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
......@@ -141,26 +141,26 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
real value = 0;
DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
......@@ -189,9 +189,9 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
......@@ -200,22 +200,22 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
......
......@@ -17,13 +17,13 @@ void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
__global real4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempForceBuffer,
__global real* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempForceBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else
unsigned int numTiles
#endif
......@@ -36,7 +36,7 @@ void computeN2Energy(
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
......@@ -65,9 +65,9 @@ void computeN2Energy(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
......@@ -99,23 +99,23 @@ void computeN2Energy(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
......@@ -164,7 +164,7 @@ void computeN2Energy(
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -182,23 +182,23 @@ void computeN2Energy(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
......
......@@ -16,13 +16,13 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
__global real4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __local float4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempBuffer,
__global real* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -37,7 +37,7 @@ __kernel void computeN2Energy(
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
......@@ -49,7 +49,7 @@ __kernel void computeN2Energy(
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
if (pos < end) {
#ifdef USE_CUTOFF
......@@ -69,7 +69,7 @@ __kernel void computeN2Energy(
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
......@@ -102,23 +102,23 @@ __kernel void computeN2Energy(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
......@@ -143,7 +143,7 @@ __kernel void computeN2Energy(
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
......@@ -165,23 +165,23 @@ __kernel void computeN2Energy(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
......
......@@ -9,9 +9,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq
PARAMETER_ARGUMENTS) {
float energy = 0.0f;
real energy = 0;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Reduce the derivatives
......@@ -21,8 +21,8 @@ __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global
// Now calculate the per-particle energy terms.
float4 pos = posq[index];
float4 force = (float4) 0.0f;
real4 pos = posq[index];
real4 force = (real4) 0;
COMPUTE_ENERGY
index += get_global_size(0);
}
......
......@@ -2,12 +2,12 @@
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
__kernel void computeGradientChainRuleTerms(__global float4* restrict forceBuffers, __global const float4* restrict posq
__kernel void computeGradientChainRuleTerms(__global real4* restrict forceBuffers, __global const real4* restrict posq
PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
float4 pos = posq[index];
float4 force = forceBuffers[index];
real4 pos = posq[index];
real4 force = forceBuffers[index];
COMPUTE_FORCES
forceBuffers[index] = force;
index += get_global_size(0);
......
......@@ -4,11 +4,11 @@
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global float* restrict global_value, __local float* restrict local_value,
__local float* restrict tempBuffer,
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real* restrict global_value, __local real* restrict local_value,
__local real* restrict tempBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -76,29 +76,29 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
unsigned int excl = exclusions[exclusionIndex+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
real value = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
......@@ -125,7 +125,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++)
local_value[tgx] = 0.0f;
local_value[tgx] = 0;
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
......@@ -135,22 +135,22 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
real value = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real r2 = dot(delta.xyz, delta.xyz);
real tempValue1 = 0;
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
......@@ -177,8 +177,8 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
real value = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
......@@ -187,22 +187,22 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
......
......@@ -9,17 +9,17 @@
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value,
#else
__global float* restrict global_value,
__global real* restrict global_value,
#endif
__local float* restrict local_value,
__local float* restrict tempBuffer,
__local real* restrict local_value,
__local real* restrict tempBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else
unsigned int numTiles
#endif
......@@ -32,7 +32,7 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
......@@ -60,8 +60,8 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
real value = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
......@@ -93,23 +93,23 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
......@@ -154,7 +154,7 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
local_value[get_local_id(0)] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
......@@ -171,23 +171,23 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
......
......@@ -7,16 +7,16 @@
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions,
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value,
#else
__global float* restrict global_value,
__global real* restrict global_value,
#endif
__local float* restrict local_value, __local float* restrict tempBuffer,
__local real* restrict local_value, __local real* restrict tempBuffer,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -31,7 +31,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
......@@ -43,7 +43,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float value = 0.0f;
real value = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
......@@ -62,7 +62,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
......@@ -95,23 +95,23 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
......@@ -137,7 +137,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
local_value[get_local_id(0)] = 0;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
......@@ -150,19 +150,19 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real tempValue1 = 0;
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
......@@ -197,23 +197,23 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
......
......@@ -2,11 +2,11 @@
* Reduce a pairwise computed value, and compute per-particle values.
*/
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global float4* posq,
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global real4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* valueBuffers
#else
__global float* valueBuffers
__global real* valueBuffers
#endif
PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0);
......@@ -14,17 +14,17 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global
// Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS
float sum = (1.0f/0xFFFFFFFF)*valueBuffers[index];
real sum = (1.0f/0xFFFFFFFF)*valueBuffers[index];
#else
int totalSize = bufferSize*numBuffers;
float sum = valueBuffers[index];
real sum = valueBuffers[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += valueBuffers[i];
#endif
// Now calculate other values
float4 pos = posq[index];
real4 pos = posq[index];
COMPUTE_VALUES
index += get_global_size(0);
}
......
......@@ -3,7 +3,7 @@ if (!isExcluded && r2 < CUTOFF_SQUARED) {
#else
if (!isExcluded) {
#endif
float tempForce = 0.0f;
real tempForce = 0.0f;
COMPUTE_FORCE
dEdR += tempForce*invR;
}
{
float invRSquaredOver4 = 0.25f*invR*invR;
float rScaledRadiusJ = r+obcParams2.y;
float rScaledRadiusI = r+obcParams1.y;
float l_ijJ = RECIP(max(obcParams1.x, fabs(r-obcParams2.y)));
float l_ijI = RECIP(max(obcParams2.x, fabs(r-obcParams1.y)));
float u_ijJ = RECIP(rScaledRadiusJ);
float u_ijI = RECIP(rScaledRadiusI);
float l_ij2J = l_ijJ*l_ijJ;
float l_ij2I = l_ijI*l_ijI;
float u_ij2J = u_ijJ*u_ijJ;
float u_ij2I = u_ijI*u_ijI;
float t1J = LOG(u_ijJ*RECIP(l_ijJ));
float t1I = LOG(u_ijI*RECIP(l_ijI));
float t2J = (l_ij2J-u_ij2J);
float t2I = (l_ij2I-u_ij2I);
float term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
float term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
float tempdEdR = select(0.0f, bornForce1*term1, obcParams1.x < rScaledRadiusJ);
tempdEdR += select(0.0f, bornForce2*term2, obcParams2.x < rScaledRadiusI);
real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+obcParams2.y;
real rScaledRadiusI = r+obcParams1.y;
real l_ijJ = RECIP(max((real) obcParams1.x, fabs(r-obcParams2.y)));
real l_ijI = RECIP(max((real) obcParams2.x, fabs(r-obcParams1.y)));
real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ;
real l_ij2I = l_ijI*l_ijI;
real u_ij2J = u_ijJ*u_ijJ;
real u_ij2I = u_ijI*u_ijI;
real t1J = LOG(u_ijJ*RECIP(l_ijJ));
real t1I = LOG(u_ijI*RECIP(l_ijI));
real t2J = (l_ij2J-u_ij2J);
real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1 : (real) 0);
tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2 : (real) 0);
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif
dEdR += select(0.0f, tempdEdR, includeInteraction);
dEdR += (includeInteraction ? tempdEdR : (real) 0);
}
......@@ -10,18 +10,18 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
#ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornSum,
#else
__global const float* restrict bornSum,
__global const real* restrict bornSum,
#endif
__global const float2* restrict params, __global float* restrict bornRadii, __global float* restrict obcChain) {
__global const float2* restrict params, __global real* restrict bornRadii, __global real* restrict obcChain) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Get summed Born data
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS
float sum = (1.0f/0xFFFFFFFF)*bornSum[index];
real sum = (1/(real) 0xFFFFFFFF)*bornSum[index];
#else
float sum = bornSum[index];
real sum = bornSum[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
#endif
......@@ -30,13 +30,13 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
float offsetRadius = params[index].x;
sum *= 0.5f*offsetRadius;
float sum2 = sum*sum;
float sum3 = sum*sum2;
float tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
float nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
float radius = 1.0f/(1.0f/offsetRadius - tanhSum/nonOffsetRadius);
float chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2);
chain = (1.0f-tanhSum*tanhSum)*chain / nonOffsetRadius;
real sum2 = sum*sum;
real sum3 = sum*sum2;
real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
real radius = 1/(1/offsetRadius - tanhSum/nonOffsetRadius);
real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2);
chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius;
obcChain[index] = chain;
index += get_global_size(0);
......@@ -47,31 +47,31 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force.
*/
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce,
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bornForce,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornForceIn,
#endif
__global float* restrict energyBuffer, __global const float2* restrict params, __global const float* restrict bornRadii, __global const float* restrict obcChain) {
float energy = 0.0f;
__global real* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) {
real energy = 0.0f;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Sum the Born force
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS
float force = (1.0f/0xFFFFFFFF)*bornForceIn[index];
real force = (1/(real) 0xFFFFFFFF)*bornForceIn[index];
#else
float force = bornForce[index];
real force = bornForce[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
#endif
// Now calculate the actual force
float offsetRadius = params[index].x;
float bornRadius = bornRadii[index];
float r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
float ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, 6.0f);
float saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
real bornRadius = bornRadii[index];
real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
real ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
force += saTerm/bornRadius;
energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index];
......
#define TILE_SIZE 32
typedef struct {
float x, y, z;
float q;
real x, y, z;
real q;
float radius, scaledRadius;
float bornSum;
real bornSum;
} AtomData1;
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params,
__kernel void computeBornSum(__global real* restrict global_bornSum, __global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
unsigned int numTiles) {
#endif
......@@ -53,7 +53,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
......@@ -68,31 +68,31 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
real bornSum = 0.0f;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.y-r)
......@@ -117,46 +117,46 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
real bornSum = 0.0f;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
......@@ -184,20 +184,20 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
}
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float bornRadius;
real x, y, z;
real q;
real fx, fy, fz, fw;
real bornRadius;
} AtomData2;
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
unsigned int numTiles) {
#endif
......@@ -209,7 +209,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE];
......@@ -238,7 +238,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
......@@ -251,34 +251,34 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
real4 force = 0.0f;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
energy += 0.5f*tempEnergy;
force.xyz -= delta.xyz*dEdR;
}
......@@ -305,34 +305,34 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
real4 force = 0.0f;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
......@@ -354,7 +354,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
float4 f = forceBuffers[offset];
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
......
......@@ -5,7 +5,7 @@
#define TILE_SIZE 32
typedef struct {
float x, y, z;
real x, y, z;
float radius, scaledRadius;
} AtomData1;
......@@ -18,11 +18,11 @@ void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum,
#else
__global float* restrict global_bornSum,
__global real* restrict global_bornSum,
#endif
__global const float4* restrict posq, __global const float2* restrict global_params,
__global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
unsigned int numTiles) {
#endif
......@@ -36,8 +36,8 @@ void computeBornSum(
#endif
unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[TILE_SIZE];
__local float localBornSum[FORCE_WORK_GROUP_SIZE];
__local float localTemp[TILE_SIZE];
__local real localBornSum[FORCE_WORK_GROUP_SIZE];
__local real localTemp[TILE_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
......@@ -62,8 +62,8 @@ void computeBornSum(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
real bornSum = 0.0f;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
......@@ -77,27 +77,27 @@ void computeBornSum(
}
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
real4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#endif
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeInteraction);
bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeInteraction && params1.x < params2.y-r);
......@@ -128,11 +128,11 @@ void computeBornSum(
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
float2 tempParams = global_params[j];
real2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
......@@ -143,41 +143,41 @@ void computeBornSum(
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f);
real4 delta = (real4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
{
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ);
bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeTerm);
bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeTerm && params1.x < params2.y-r);
}
float rScaledRadiusI = r+params1.y;
real rScaledRadiusI = r+params1.y;
{
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
......@@ -206,8 +206,8 @@ void computeBornSum(
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
real sum1 = global_bornSum[offset1];
real sum2 = global_bornSum[offset2];
sum1 += bornSum + localTemp[tgx];
sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1;
......@@ -222,15 +222,15 @@ void computeBornSum(
}
typedef struct {
float x, y, z, w;
float padding;
real x, y, z, w;
real padding;
} PaddedUnalignedFloat4;
typedef struct {
float x, y, z;
float q;
float bornRadius;
float temp_x, temp_y, temp_z, temp_w;
real x, y, z;
real q;
real bornRadius;
real temp_x, temp_y, temp_z, temp_w;
} AtomData2;
/**
......@@ -242,11 +242,11 @@ void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
unsigned int numTiles) {
#endif
......@@ -258,7 +258,7 @@ void computeGBSAForce1(
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE];
__local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
......@@ -286,9 +286,9 @@ void computeGBSAForce1(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
real4 force = 0.0f;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
......@@ -302,26 +302,26 @@ void computeGBSAForce1(
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[baseLocalAtom+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[baseLocalAtom+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED);
tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED);
......@@ -355,9 +355,9 @@ void computeGBSAForce1(
#else
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum = forceBuffers[offset];
float global_sum = global_bornForce[offset];
// Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
real4 sum = forceBuffers[offset];
real global_sum = global_bornForce[offset];
sum.x += force.x + localData[tgx].temp_x;
sum.y += force.y + localData[tgx].temp_y;
sum.z += force.z + localData[tgx].temp_z;
......@@ -373,7 +373,7 @@ void computeGBSAForce1(
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
......@@ -391,26 +391,26 @@ void computeGBSAForce1(
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED);
tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED);
......@@ -458,11 +458,11 @@ void computeGBSAForce1(
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2];
float global_sum1 = global_bornForce[offset1];
float global_sum2 = global_bornForce[offset2];
// Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
real4 sum1 = forceBuffers[offset1];
real4 sum2 = forceBuffers[offset2];
real global_sum1 = global_bornForce[offset1];
real global_sum2 = global_bornForce[offset2];
sum1.x += force.x + localData[tgx].temp_x;
sum1.y += force.y + localData[tgx].temp_y;
sum1.z += force.z + localData[tgx].temp_z;
......
......@@ -6,10 +6,10 @@
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct {
float x, y, z;
float q;
real x, y, z;
real q;
float radius, scaledRadius;
float bornSum;
real bornSum;
} AtomData1;
/**
......@@ -19,11 +19,11 @@ __kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum,
#else
__global float* restrict global_bornSum,
__global real* restrict global_bornSum,
#endif
__global const float4* restrict posq, __global const float2* restrict global_params,
__global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
unsigned int numTiles,
#endif
......@@ -40,7 +40,7 @@ __kernel void computeBornSum(
#endif
unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[FORCE_WORK_GROUP_SIZE];
__local real tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local int exclusionIndex[WARPS_PER_GROUP];
......@@ -51,7 +51,7 @@ __kernel void computeBornSum(
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float bornSum = 0.0f;
real bornSum = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
......@@ -70,7 +70,7 @@ __kernel void computeBornSum(
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (pos >= end)
; // This warp is done.
......@@ -84,28 +84,28 @@ __kernel void computeBornSum(
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
......@@ -119,7 +119,7 @@ __kernel void computeBornSum(
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
......@@ -151,42 +151,42 @@ __kernel void computeBornSum(
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
......@@ -211,41 +211,41 @@ __kernel void computeBornSum(
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
......@@ -327,10 +327,10 @@ __kernel void computeBornSum(
}
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float bornRadius;
real x, y, z;
real q;
real fx, fy, fz, fw;
real bornRadius;
} AtomData2;
/**
......@@ -341,11 +341,11 @@ __kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
unsigned int numTiles,
#endif
......@@ -360,10 +360,10 @@ __kernel void computeGBSAForce1(
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f;
real energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local int exclusionIndex[WARPS_PER_GROUP];
......@@ -374,7 +374,7 @@ __kernel void computeGBSAForce1(
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
real4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
......@@ -393,8 +393,8 @@ __kernel void computeGBSAForce1(
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
......@@ -405,29 +405,29 @@ __kernel void computeGBSAForce1(
localData[get_local_id(0)].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
......@@ -443,7 +443,7 @@ __kernel void computeGBSAForce1(
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
......@@ -480,29 +480,29 @@ __kernel void computeGBSAForce1(
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
......@@ -516,11 +516,11 @@ __kernel void computeGBSAForce1(
force.w += dGpol_dalpha2_ij*bornRadius2;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
tempBuffer[get_local_id(0)] = (real4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
#ifdef USE_CUTOFF
}
else
tempBuffer[get_local_id(0)] = (float4) 0.0f;
tempBuffer[get_local_id(0)] = (real4) 0;
#endif
// Sum the forces on atom j.
......@@ -528,7 +528,7 @@ __kernel void computeGBSAForce1(
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
real4 sum = tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
......@@ -546,29 +546,29 @@ __kernel void computeGBSAForce1(
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR;
......@@ -648,7 +648,7 @@ __kernel void computeGBSAForce1(
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
global_bornForce[offset] += localData[get_local_id(0)].fw;
}
done = true;
......
......@@ -2,7 +2,7 @@
* Scale the particle positions.
*/
__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* restrict posq,
__kernel void scalePositions(float scale, int numMolecules, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global real4* restrict posq,
__global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) {
for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) {
int first = moleculeStartIndex[index];
......@@ -11,24 +11,24 @@ __kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSi
// Find the center of each molecule.
float4 center = (float4) 0.0f;
real4 center = (real4) 0;
for (int atom = first; atom < last; atom++)
center += posq[moleculeAtoms[atom]];
center /= (float) numAtoms;
center /= (real) numAtoms;
// Move it into the first periodic box.
int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
float4 delta = (float4) (xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z, 0);
real4 delta = (real4) (xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z, 0);
center -= delta;
// Now scale the position of the molecule center.
delta = center*(scale-1)-delta;
for (int atom = first; atom < last; atom++) {
float4 pos = posq[moleculeAtoms[atom]];
real4 pos = posq[moleculeAtoms[atom]];
pos.xyz += delta.xyz;
posq[moleculeAtoms[atom]] = pos;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment