"platforms/reference/src/SimTKReference/ObcParameters.cpp" did not exist on "b245e77212179e9cb0060f7f4cff60970519fccf"
Commit 59f8cf87 authored by Peter Eastman's avatar Peter Eastman
Browse files

Finished implementing double precision for OpenCL

parent c8dac206
...@@ -575,11 +575,11 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -575,11 +575,11 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Build the list of virtual sites. // Build the list of virtual sites.
vector<mm_int4> vsite2AvgAtomVec; vector<mm_int4> vsite2AvgAtomVec;
vector<mm_float2> vsite2AvgWeightVec; vector<mm_double2> vsite2AvgWeightVec;
vector<mm_int4> vsite3AvgAtomVec; vector<mm_int4> vsite3AvgAtomVec;
vector<mm_float4> vsite3AvgWeightVec; vector<mm_double4> vsite3AvgWeightVec;
vector<mm_int4> vsiteOutOfPlaneAtomVec; vector<mm_int4> vsiteOutOfPlaneAtomVec;
vector<mm_float4> vsiteOutOfPlaneWeightVec; vector<mm_double4> vsiteOutOfPlaneWeightVec;
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
if (system.isVirtualSite(i)) { if (system.isVirtualSite(i)) {
if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) { if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
...@@ -587,21 +587,21 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -587,21 +587,21 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i)); const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
vsite2AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), 0)); vsite2AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), 0));
vsite2AvgWeightVec.push_back(mm_float2((float) site.getWeight(0), (float) site.getWeight(1))); vsite2AvgWeightVec.push_back(mm_double2(site.getWeight(0), site.getWeight(1)));
} }
else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) { else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// A three particle average. // A three particle average.
const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i)); const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
vsite3AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2))); vsite3AvgAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsite3AvgWeightVec.push_back(mm_float4((float) site.getWeight(0), (float) site.getWeight(1), (float) site.getWeight(2), 0.0f)); vsite3AvgWeightVec.push_back(mm_double4(site.getWeight(0), site.getWeight(1), site.getWeight(2), 0.0));
} }
else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) { else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// An out of plane site. // An out of plane site.
const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i)); const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
vsiteOutOfPlaneAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2))); vsiteOutOfPlaneAtomVec.push_back(mm_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsiteOutOfPlaneWeightVec.push_back(mm_float4((float) site.getWeight12(), (float) site.getWeight13(), (float) site.getWeightCross(), 0.0f)); vsiteOutOfPlaneWeightVec.push_back(mm_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
} }
} }
} }
...@@ -609,22 +609,47 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -609,22 +609,47 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
int num3Avg = vsite3AvgAtomVec.size(); int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size(); int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
vsite2AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms"); vsite2AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite2AvgWeights = OpenCLArray::create<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms"); vsite3AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsite3AvgWeights = OpenCLArray::create<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneAtoms = OpenCLArray::create<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms"); vsiteOutOfPlaneAtoms = OpenCLArray::create<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); if (num2Avg > 0)
if (num2Avg > 0) {
vsite2AvgAtoms->upload(vsite2AvgAtomVec); vsite2AvgAtoms->upload(vsite2AvgAtomVec);
vsite2AvgWeights->upload(vsite2AvgWeightVec); if (num3Avg > 0)
}
if (num3Avg > 0) {
vsite3AvgAtoms->upload(vsite3AvgAtomVec); vsite3AvgAtoms->upload(vsite3AvgAtomVec);
vsite3AvgWeights->upload(vsite3AvgWeightVec); if (numOutOfPlane > 0)
}
if (numOutOfPlane > 0) {
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec); vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec); if (context.getUseDoublePrecision()) {
vsite2AvgWeights = OpenCLArray::create<mm_double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = OpenCLArray::create<mm_double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
if (num2Avg > 0)
vsite2AvgWeights->upload(vsite2AvgWeightVec);
if (num3Avg > 0)
vsite3AvgWeights->upload(vsite3AvgWeightVec);
if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
}
else {
vsite2AvgWeights = OpenCLArray::create<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = OpenCLArray::create<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
if (num2Avg > 0) {
vector<mm_float2> floatWeights(num2Avg);
for (int i = 0; i < num2Avg; i++)
floatWeights[i] = mm_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y);
vsite2AvgWeights->upload(floatWeights);
}
if (num3Avg > 0) {
vector<mm_float4> floatWeights(num3Avg);
for (int i = 0; i < num3Avg; i++)
floatWeights[i] = mm_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f);
vsite3AvgWeights->upload(floatWeights);
}
if (numOutOfPlane > 0) {
vector<mm_float4> floatWeights(numOutOfPlane);
for (int i = 0; i < numOutOfPlane; i++)
floatWeights[i] = mm_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f);
vsiteOutOfPlaneWeights->upload(floatWeights);
}
} }
// Create the kernels for virtual sites. // Create the kernels for virtual sites.
......
...@@ -2006,33 +2006,40 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB ...@@ -2006,33 +2006,40 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
throw OpenMMException("GBSAOBCForce does not support using multiple OpenCL devices"); throw OpenMMException("GBSAOBCForce does not support using multiple OpenCL devices");
OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities(); OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
params = OpenCLArray::create<mm_float2>(cl, cl.getPaddedNumAtoms(), "gbsaObcParams"); params = OpenCLArray::create<mm_float2>(cl, cl.getPaddedNumAtoms(), "gbsaObcParams");
bornRadii = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "bornRadii"); int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
obcChain = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "obcChain"); bornRadii = new OpenCLArray(cl, cl.getPaddedNumAtoms(), elementSize, "bornRadii");
obcChain = new OpenCLArray(cl, cl.getPaddedNumAtoms(), elementSize, "obcChain");
if (cl.getSupports64BitGlobalAtomics()) { if (cl.getSupports64BitGlobalAtomics()) {
longBornSum = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornSum"); longBornSum = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornSum");
longBornForce = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornForce"); longBornForce = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "longBornForce");
bornForce = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms(), "bornForce"); bornForce = new OpenCLArray(cl, cl.getPaddedNumAtoms(), elementSize, "bornForce");
cl.addAutoclearBuffer(*longBornSum); cl.addAutoclearBuffer(*longBornSum);
cl.addAutoclearBuffer(*longBornForce); cl.addAutoclearBuffer(*longBornForce);
} }
else { else {
bornSum = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum"); bornSum = new OpenCLArray(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "bornSum");
bornForce = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce"); bornForce = new OpenCLArray(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "bornForce");
cl.addAutoclearBuffer(*bornSum); cl.addAutoclearBuffer(*bornSum);
cl.addAutoclearBuffer(*bornForce); cl.addAutoclearBuffer(*bornForce);
} }
vector<mm_float4> posq(cl.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0)); vector<mm_float4> posqf(cl.getPaddedNumAtoms());
int numParticles = force.getNumParticles(); vector<mm_double4> posqd(cl.getPaddedNumAtoms());
vector<mm_float2> paramsVector(cl.getPaddedNumAtoms()); vector<mm_float2> paramsVector(cl.getPaddedNumAtoms());
const double dielectricOffset = 0.009; const double dielectricOffset = 0.009;
for (int i = 0; i < numParticles; i++) { for (int i = 0; i < force.getNumParticles(); i++) {
double charge, radius, scalingFactor; double charge, radius, scalingFactor;
force.getParticleParameters(i, charge, radius, scalingFactor); force.getParticleParameters(i, charge, radius, scalingFactor);
radius -= dielectricOffset; radius -= dielectricOffset;
paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius)); paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
posq[i].w = (float) charge; if (cl.getUseDoublePrecision())
posqd[i] = mm_double4(0, 0, 0, charge);
else
posqf[i] = mm_float4(0, 0, 0, (float) charge);
} }
cl.getPosq().upload(posq); if (cl.getUseDoublePrecision())
cl.getPosq().upload(posqd);
else
cl.getPosq().upload(posqf);
params->upload(paramsVector); params->upload(paramsVector);
prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric())); prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff); bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
...@@ -2040,7 +2047,7 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB ...@@ -2040,7 +2047,7 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
string source = OpenCLKernelSources::gbsaObc2; string source = OpenCLKernelSources::gbsaObc2;
nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup()); nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));; nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDeviceBuffer()));; nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "real", 1, elementSize, bornForce->getDeviceBuffer()));;
cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force)); cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
} }
...@@ -2141,10 +2148,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2141,10 +2148,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(5, cl.getPeriodicBoxSize()); setPeriodicBoxSizeArg(cl, computeBornSumKernel, 5);
computeBornSumKernel.setArg<mm_float4>(6, cl.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(cl, computeBornSumKernel, 6);
force1Kernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize()); setPeriodicBoxSizeArg(cl, force1Kernel, 7);
force1Kernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(cl, force1Kernel, 8);
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(3, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(3, nb.getInteractingTiles().getDeviceBuffer());
...@@ -2174,8 +2181,9 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, ...@@ -2174,8 +2181,9 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context,
// Record the per-particle parameters. // Record the per-particle parameters.
OpenCLArray& posq = cl.getPosq(); OpenCLArray& posq = cl.getPosq();
posq.download((mm_float4*) cl.getPinnedBuffer());
mm_float4* posqf = (mm_float4*) cl.getPinnedBuffer(); mm_float4* posqf = (mm_float4*) cl.getPinnedBuffer();
mm_double4* posqd = (mm_double4*) cl.getPinnedBuffer();
posq.download(cl.getPinnedBuffer());
vector<mm_float2> paramsVector(cl.getPaddedNumAtoms()); vector<mm_float2> paramsVector(cl.getPaddedNumAtoms());
const double dielectricOffset = 0.009; const double dielectricOffset = 0.009;
for (int i = 0; i < numParticles; i++) { for (int i = 0; i < numParticles; i++) {
...@@ -2183,7 +2191,10 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, ...@@ -2183,7 +2191,10 @@ void OpenCLCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context,
force.getParticleParameters(i, charge, radius, scalingFactor); force.getParticleParameters(i, charge, radius, scalingFactor);
radius -= dielectricOffset; radius -= dielectricOffset;
paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius)); paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
posqf[i].w = (float) charge; if (cl.getUseDoublePrecision())
posqd[i].w = charge;
else
posqf[i].w = (float) charge;
} }
posq.upload(cl.getPinnedBuffer()); posq.upload(cl.getPinnedBuffer());
params->upload(paramsVector); params->upload(paramsVector);
...@@ -2597,14 +2608,14 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2597,14 +2608,14 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
extraArgs << ", __global long* restrict derivBuffers"; extraArgs << ", __global long* restrict derivBuffers";
for (int i = 0; i < force.getNumComputedValues(); i++) { for (int i = 0; i < force.getNumComputedValues(); i++) {
string index = cl.intToString(i+1); string index = cl.intToString(i+1);
extraArgs << ", __local float* restrict local_deriv" << index; extraArgs << ", __local real* restrict local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n"; clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
declare1 << "float deriv" << index << "_1 = 0.0f;\n"; declare1 << "real deriv" << index << "_1 = 0;\n";
load2 << "float deriv" << index << "_2 = 0.0f;\n"; load2 << "real deriv" << index << "_2 = 0;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n"; recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n"; storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n"; storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
declareTemps << "__local float tempDerivBuffer" << index << "[64];\n"; declareTemps << "__local real tempDerivBuffer" << index << "[64];\n";
setTemps << "tempDerivBuffer" << index << "[get_local_id(0)] = deriv" << index << "_1;\n"; setTemps << "tempDerivBuffer" << index << "[get_local_id(0)] = deriv" << index << "_1;\n";
} }
} }
...@@ -2728,14 +2739,14 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2728,14 +2739,14 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
} }
for (int i = 1; i < force.getNumComputedValues(); i++) for (int i = 1; i < force.getNumComputedValues(); i++)
for (int j = 0; j < i; j++) for (int j = 0; j < i; j++)
expressions["float dV"+cl.intToString(i)+"dV"+cl.intToString(j)+" = "] = valueDerivExpressions[i][j]; expressions["real dV"+cl.intToString(i)+"dV"+cl.intToString(j)+" = "] = valueDerivExpressions[i][j];
compute << cl.getExpressionUtilities().createExpressions(expressions, variables, functionDefinitions, "temp", prefix+"functionParams"); compute << cl.getExpressionUtilities().createExpressions(expressions, variables, functionDefinitions, "temp", prefix+"functionParams");
// Record values. // Record values.
compute << "forceBuffers[index] = forceBuffers[index]+force;\n"; compute << "forceBuffers[index] = forceBuffers[index]+force;\n";
for (int i = 1; i < force.getNumComputedValues(); i++) { for (int i = 1; i < force.getNumComputedValues(); i++) {
compute << "float totalDeriv"<<i<<" = dV"<<i<<"dV0"; compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
for (int j = 1; j < i; j++) for (int j = 1; j < i; j++)
compute << " + totalDeriv"<<j<<"*dV"<<i<<"dV"<<j; compute << " + totalDeriv"<<j<<"*dV"<<i<<"dV"<<j;
compute << ";\n"; compute << ";\n";
...@@ -2789,12 +2800,12 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2789,12 +2800,12 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]"); variables[computedValueNames[i]] = "values"+computedValues->getParameterSuffix(i, "[index]");
for (int i = 1; i < force.getNumComputedValues(); i++) { for (int i = 1; i < force.getNumComputedValues(); i++) {
string is = cl.intToString(i); string is = cl.intToString(i);
compute << "float4 dV"<<is<<"dR = (float4) 0;\n"; compute << "real4 dV"<<is<<"dR = (real4) 0;\n";
for (int j = 1; j < i; j++) { for (int j = 1; j < i; j++) {
if (!isZeroExpression(valueDerivExpressions[i][j])) { if (!isZeroExpression(valueDerivExpressions[i][j])) {
map<string, Lepton::ParsedExpression> derivExpressions; map<string, Lepton::ParsedExpression> derivExpressions;
string js = cl.intToString(j); string js = cl.intToString(j);
derivExpressions["float dV"+is+"dV"+js+" = "] = valueDerivExpressions[i][j]; derivExpressions["real dV"+is+"dV"+js+" = "] = valueDerivExpressions[i][j];
compute << cl.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, "temp_"+is+"_"+js, prefix+"functionParams"); compute << cl.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, "temp_"+is+"_"+js, prefix+"functionParams");
compute << "dV"<<is<<"dR += dV"<<is<<"dV"<<js<<"*dV"<<js<<"dR;\n"; compute << "dV"<<is<<"dR += dV"<<is<<"dV"<<js<<"*dV"<<js<<"dR;\n";
} }
...@@ -2845,8 +2856,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2845,8 +2856,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
map<string, Lepton::ParsedExpression> derivExpressions; map<string, Lepton::ParsedExpression> derivExpressions;
stringstream chainSource; stringstream chainSource;
Lepton::ParsedExpression dVdR = Lepton::Parser::parse(computedValueExpressions[0], functions).differentiate("r").optimize(); Lepton::ParsedExpression dVdR = Lepton::Parser::parse(computedValueExpressions[0], functions).differentiate("r").optimize();
derivExpressions["float dV0dR1 = "] = dVdR; derivExpressions["real dV0dR1 = "] = dVdR;
derivExpressions["float dV0dR2 = "] = dVdR.renameVariables(rename); derivExpressions["real dV0dR2 = "] = dVdR.renameVariables(rename);
chainSource << cl.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, prefix+"temp0_", prefix+"functionParams"); chainSource << cl.getExpressionUtilities().createExpressions(derivExpressions, variables, functionDefinitions, prefix+"temp0_", prefix+"functionParams");
if (needChainForValue[0]) { if (needChainForValue[0]) {
if (useExclusionsForValue) if (useExclusionsForValue)
...@@ -2911,6 +2922,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2911,6 +2922,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) { double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities(); OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0); maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
...@@ -2921,21 +2933,21 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2921,21 +2933,21 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
cl.clearBuffer(*longValueBuffers); cl.clearBuffer(*longValueBuffers);
} }
else { else {
valueBuffers = OpenCLArray::create<cl_float>(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), "customGBValueBuffers"); valueBuffers = new OpenCLArray(cl, cl.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "customGBValueBuffers");
cl.addAutoclearBuffer(*valueBuffers); cl.addAutoclearBuffer(*valueBuffers);
cl.clearBuffer(*valueBuffers); cl.clearBuffer(*valueBuffers);
} }
int index = 0; int index = 0;
pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl. /// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl. /// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -2979,9 +2991,9 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2979,9 +2991,9 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
index = 0; index = 0;
pairEnergyKernel.setArg<cl::Buffer>(index++, useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
...@@ -3017,7 +3029,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3017,7 +3029,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
if (useLong) { if (useLong) {
pairEnergyKernel.setArg<cl::Memory>(index++, longEnergyDerivs->getDeviceBuffer()); pairEnergyKernel.setArg<cl::Memory>(index++, longEnergyDerivs->getDeviceBuffer());
for (int i = 0; i < numComputedValues; ++i) for (int i = 0; i < numComputedValues; ++i)
pairEnergyKernel.setArg(index++, nb.getForceThreadBlockSize()*sizeof(cl_float), NULL); pairEnergyKernel.setArg(index++, nb.getForceThreadBlockSize()*elementSize, NULL);
} }
else { else {
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) { for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
...@@ -3078,10 +3090,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3078,10 +3090,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues); globals->upload(globalParamValues);
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize()); setPeriodicBoxSizeArg(cl, pairValueKernel, 10);
pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(cl, pairValueKernel, 11);
pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize()); setPeriodicBoxSizeArg(cl, pairEnergyKernel, 11);
pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(cl, pairEnergyKernel, 12);
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer());
...@@ -5340,8 +5352,8 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, ...@@ -5340,8 +5352,8 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context,
} }
cl.getQueue().enqueueCopyBuffer(cl.getPosq().getDeviceBuffer(), savedPositions->getDeviceBuffer(), 0, 0, cl.getPosq().getSize()*sizeof(mm_float4)); cl.getQueue().enqueueCopyBuffer(cl.getPosq().getDeviceBuffer(), savedPositions->getDeviceBuffer(), 0, 0, cl.getPosq().getSize()*sizeof(mm_float4));
kernel.setArg<cl_float>(0, (cl_float) scale); kernel.setArg<cl_float>(0, (cl_float) scale);
kernel.setArg<mm_float4>(2, cl.getPeriodicBoxSize()); setPeriodicBoxSizeArg(cl, kernel, 2);
kernel.setArg<mm_float4>(3, cl.getInvPeriodicBoxSize()); setInvPeriodicBoxSizeArg(cl, kernel, 3);
cl.executeKernel(kernel, cl.getNumAtoms()); cl.executeKernel(kernel, cl.getNumAtoms());
for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++) for (int i = 0; i < (int) cl.getPosCellOffsets().size(); i++)
cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0); cl.getPosCellOffsets()[i] = mm_int4(0, 0, 0, 0);
......
...@@ -256,8 +256,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -256,8 +256,9 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxInteractingTiles, "interactingTiles"); interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
interactionFlags = OpenCLArray::create<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags"); interactionFlags = OpenCLArray::create<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags");
interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount"); interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount");
blockCenter = OpenCLArray::create<mm_float4>(context, numAtomBlocks, "blockCenter"); int elementSize = (context.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
blockBoundingBox = OpenCLArray::create<mm_float4>(context, numAtomBlocks, "blockBoundingBox"); blockCenter = new OpenCLArray(context, numAtomBlocks, elementSize, "blockCenter");
blockBoundingBox = new OpenCLArray(context, numAtomBlocks, elementSize, "blockBoundingBox");
vector<cl_uint> count(1, 0); vector<cl_uint> count(1, 0);
interactionCount->upload(count); interactionCount->upload(count);
} }
......
...@@ -4,10 +4,10 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA ...@@ -4,10 +4,10 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif #endif
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float tempForce = 0.0f; real tempForce = 0.0f;
#else #else
float4 tempForce1 = (float4) 0.0f; real4 tempForce1 = (real4) 0;
float4 tempForce2 = (float4) 0.0f; real4 tempForce2 = (real4) 0;
#endif #endif
COMPUTE_FORCE COMPUTE_FORCE
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
......
...@@ -6,11 +6,11 @@ ...@@ -6,11 +6,11 @@
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
__kernel void computeN2Energy(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __local float4* restrict local_force, __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __local real4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices, __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempBuffer, __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -23,7 +23,7 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f ...@@ -23,7 +23,7 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
while (pos < end) { while (pos < end) {
...@@ -79,30 +79,30 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f ...@@ -79,30 +79,30 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
unsigned int excl = exclusions[exclusionIndex+tgx]; unsigned int excl = exclusions[exclusionIndex+tgx];
#endif #endif
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = local_posq[j]; real4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -129,7 +129,7 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f ...@@ -129,7 +129,7 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
// This is an off-diagonal tile. // This is an off-diagonal tile.
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
local_force[localAtomIndex] = 0.0f; local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
} }
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS) #if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
...@@ -141,26 +141,26 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f ...@@ -141,26 +141,26 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) { if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f; real value = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[j]; real4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -189,9 +189,9 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f ...@@ -189,9 +189,9 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
...@@ -200,22 +200,22 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f ...@@ -200,22 +200,22 @@ __kernel void computeN2Energy(__global float4* restrict forceBuffers, __global f
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = local_posq[j]; real4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
......
...@@ -17,13 +17,13 @@ void computeN2Energy( ...@@ -17,13 +17,13 @@ void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict forceBuffers,
#else #else
__global float4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global float* restrict energyBuffer, __local float4* restrict local_force, __global real* restrict energyBuffer, __local real4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices, __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempForceBuffer, __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempForceBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -36,7 +36,7 @@ void computeN2Energy( ...@@ -36,7 +36,7 @@ void computeN2Energy(
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
__local int exclusionIndex[1]; __local int exclusionIndex[1];
...@@ -65,9 +65,9 @@ void computeN2Energy( ...@@ -65,9 +65,9 @@ void computeN2Energy(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
...@@ -99,23 +99,23 @@ void computeN2Energy( ...@@ -99,23 +99,23 @@ void computeN2Energy(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+j; int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j; atom2 = y*TILE_SIZE+baseLocalAtom+j;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -164,7 +164,7 @@ void computeN2Energy( ...@@ -164,7 +164,7 @@ void computeN2Energy(
local_posq[localAtomIndex] = posq[j]; local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
local_force[localAtomIndex] = 0.0f; local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -182,23 +182,23 @@ void computeN2Energy( ...@@ -182,23 +182,23 @@ void computeN2Energy(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+tj; int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+tj; atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
......
...@@ -16,13 +16,13 @@ __kernel void computeN2Energy( ...@@ -16,13 +16,13 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict forceBuffers,
#else #else
__global float4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global float* restrict energyBuffer, __local float4* restrict local_force, __global real* restrict energyBuffer, __local real4* restrict local_force,
__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices, __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
__global const unsigned int* restrict exclusionRowIndices, __local float4* restrict tempBuffer, __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -37,7 +37,7 @@ __kernel void computeN2Energy( ...@@ -37,7 +37,7 @@ __kernel void computeN2Energy(
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
...@@ -49,7 +49,7 @@ __kernel void computeN2Energy( ...@@ -49,7 +49,7 @@ __kernel void computeN2Energy(
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float4 force = 0.0f; real4 force = 0;
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
if (pos < end) { if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -69,7 +69,7 @@ __kernel void computeN2Energy( ...@@ -69,7 +69,7 @@ __kernel void computeN2Energy(
} }
} }
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
...@@ -102,23 +102,23 @@ __kernel void computeN2Energy( ...@@ -102,23 +102,23 @@ __kernel void computeN2Energy(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -143,7 +143,7 @@ __kernel void computeN2Energy( ...@@ -143,7 +143,7 @@ __kernel void computeN2Energy(
local_posq[localAtomIndex] = posq[j]; local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
local_force[localAtomIndex] = 0.0f; local_force[localAtomIndex] = 0;
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
...@@ -165,23 +165,23 @@ __kernel void computeN2Energy( ...@@ -165,23 +165,23 @@ __kernel void computeN2Energy(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
float dEdR = 0.0f; real dEdR = 0;
float tempEnergy = 0.0f; real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms. * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/ */
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
float energy = 0.0f; real energy = 0;
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
// Reduce the derivatives // Reduce the derivatives
...@@ -21,8 +21,8 @@ __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global ...@@ -21,8 +21,8 @@ __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global
// Now calculate the per-particle energy terms. // Now calculate the per-particle energy terms.
float4 pos = posq[index]; real4 pos = posq[index];
float4 force = (float4) 0.0f; real4 force = (real4) 0;
COMPUTE_ENERGY COMPUTE_ENERGY
index += get_global_size(0); index += get_global_size(0);
} }
......
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
* Compute chain rule terms for computed values that depend explicitly on particle coordinates. * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/ */
__kernel void computeGradientChainRuleTerms(__global float4* restrict forceBuffers, __global const float4* restrict posq __kernel void computeGradientChainRuleTerms(__global real4* restrict forceBuffers, __global const real4* restrict posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
float4 pos = posq[index]; real4 pos = posq[index];
float4 force = forceBuffers[index]; real4 force = forceBuffers[index];
COMPUTE_FORCES COMPUTE_FORCES
forceBuffers[index] = force; forceBuffers[index] = force;
index += get_global_size(0); index += get_global_size(0);
......
...@@ -4,11 +4,11 @@ ...@@ -4,11 +4,11 @@
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
__kernel void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global float* restrict global_value, __local float* restrict local_value, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real* restrict global_value, __local real* restrict local_value,
__local float* restrict tempBuffer, __local real* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -76,29 +76,29 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -76,29 +76,29 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
unsigned int excl = exclusions[exclusionIndex+tgx]; unsigned int excl = exclusions[exclusionIndex+tgx];
#endif #endif
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f; real value = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = local_posq[j]; real4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else #else
...@@ -125,7 +125,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -125,7 +125,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
// This is an off-diagonal tile. // This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) for (int tgx = 0; tgx < TILE_SIZE; tgx++)
local_value[tgx] = 0.0f; local_value[tgx] = 0;
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS) #if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF); unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF); unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
...@@ -135,22 +135,22 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -135,22 +135,22 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) { if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f; real value = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[j]; real4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
...@@ -177,8 +177,8 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -177,8 +177,8 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f; real value = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
...@@ -187,22 +187,22 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -187,22 +187,22 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = local_posq[j]; real4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else #else
......
...@@ -9,17 +9,17 @@ ...@@ -9,17 +9,17 @@
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value, __global long* restrict global_value,
#else #else
__global float* restrict global_value, __global real* restrict global_value,
#endif #endif
__local float* restrict local_value, __local real* restrict local_value,
__local float* restrict tempBuffer, __local real* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -32,7 +32,7 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -32,7 +32,7 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
__local int exclusionIndex[1]; __local int exclusionIndex[1];
...@@ -60,8 +60,8 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -60,8 +60,8 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f; real value = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
...@@ -93,23 +93,23 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -93,23 +93,23 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+j; int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j; atom2 = y*TILE_SIZE+baseLocalAtom+j;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else #else
...@@ -154,7 +154,7 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -154,7 +154,7 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
local_value[get_local_id(0)] = 0.0f; local_value[get_local_id(0)] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
...@@ -171,23 +171,23 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri ...@@ -171,23 +171,23 @@ void computeN2Value(__global const float4* restrict posq, __local float4* restri
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = baseLocalAtom+tj; int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+tj; atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else #else
......
...@@ -7,16 +7,16 @@ ...@@ -7,16 +7,16 @@
/** /**
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
__kernel void computeN2Value(__global const float4* restrict posq, __local float4* restrict local_posq, __global const unsigned int* restrict exclusions, __kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_value, __global long* restrict global_value,
#else #else
__global float* restrict global_value, __global real* restrict global_value,
#endif #endif
__local float* restrict local_value, __local float* restrict tempBuffer, __local real* restrict local_value, __local real* restrict tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -31,7 +31,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -31,7 +31,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
...@@ -43,7 +43,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -43,7 +43,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float value = 0.0f; real value = 0;
if (pos < end) { if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -62,7 +62,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -62,7 +62,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
} }
} }
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
...@@ -95,23 +95,23 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -95,23 +95,23 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else #else
...@@ -137,7 +137,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -137,7 +137,7 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
local_value[get_local_id(0)] = 0.0f; local_value[get_local_id(0)] = 0;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) { if (!hasExclusions && flags != 0xFFFFFFFF) {
...@@ -150,19 +150,19 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -150,19 +150,19 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
...@@ -197,23 +197,23 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float ...@@ -197,23 +197,23 @@ __kernel void computeN2Value(__global const float4* restrict posq, __local float
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2]; real4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
float tempValue1 = 0.0f; real tempValue1 = 0;
float tempValue2 = 0.0f; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else #else
......
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
* Reduce a pairwise computed value, and compute per-particle values. * Reduce a pairwise computed value, and compute per-particle values.
*/ */
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global float4* posq, __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global real4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* valueBuffers __global long* valueBuffers
#else #else
__global float* valueBuffers __global real* valueBuffers
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
...@@ -14,17 +14,17 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global ...@@ -14,17 +14,17 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global
// Reduce the pairwise value // Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
float sum = (1.0f/0xFFFFFFFF)*valueBuffers[index]; real sum = (1.0f/0xFFFFFFFF)*valueBuffers[index];
#else #else
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
float sum = valueBuffers[index]; real sum = valueBuffers[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += valueBuffers[i]; sum += valueBuffers[i];
#endif #endif
// Now calculate other values // Now calculate other values
float4 pos = posq[index]; real4 pos = posq[index];
COMPUTE_VALUES COMPUTE_VALUES
index += get_global_size(0); index += get_global_size(0);
} }
......
...@@ -3,7 +3,7 @@ if (!isExcluded && r2 < CUTOFF_SQUARED) { ...@@ -3,7 +3,7 @@ if (!isExcluded && r2 < CUTOFF_SQUARED) {
#else #else
if (!isExcluded) { if (!isExcluded) {
#endif #endif
float tempForce = 0.0f; real tempForce = 0.0f;
COMPUTE_FORCE COMPUTE_FORCE
dEdR += tempForce*invR; dEdR += tempForce*invR;
} }
{ {
float invRSquaredOver4 = 0.25f*invR*invR; real invRSquaredOver4 = 0.25f*invR*invR;
float rScaledRadiusJ = r+obcParams2.y; real rScaledRadiusJ = r+obcParams2.y;
float rScaledRadiusI = r+obcParams1.y; real rScaledRadiusI = r+obcParams1.y;
float l_ijJ = RECIP(max(obcParams1.x, fabs(r-obcParams2.y))); real l_ijJ = RECIP(max((real) obcParams1.x, fabs(r-obcParams2.y)));
float l_ijI = RECIP(max(obcParams2.x, fabs(r-obcParams1.y))); real l_ijI = RECIP(max((real) obcParams2.x, fabs(r-obcParams1.y)));
float u_ijJ = RECIP(rScaledRadiusJ); real u_ijJ = RECIP(rScaledRadiusJ);
float u_ijI = RECIP(rScaledRadiusI); real u_ijI = RECIP(rScaledRadiusI);
float l_ij2J = l_ijJ*l_ijJ; real l_ij2J = l_ijJ*l_ijJ;
float l_ij2I = l_ijI*l_ijI; real l_ij2I = l_ijI*l_ijI;
float u_ij2J = u_ijJ*u_ijJ; real u_ij2J = u_ijJ*u_ijJ;
float u_ij2I = u_ijI*u_ijI; real u_ij2I = u_ijI*u_ijI;
float t1J = LOG(u_ijJ*RECIP(l_ijJ)); real t1J = LOG(u_ijJ*RECIP(l_ijJ));
float t1I = LOG(u_ijI*RECIP(l_ijI)); real t1I = LOG(u_ijI*RECIP(l_ijI));
float t2J = (l_ij2J-u_ij2J); real t2J = (l_ij2J-u_ij2J);
float t2I = (l_ij2I-u_ij2I); real t2I = (l_ij2I-u_ij2I);
float term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR; real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
float term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR; real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
float tempdEdR = select(0.0f, bornForce1*term1, obcParams1.x < rScaledRadiusJ); real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1 : (real) 0);
tempdEdR += select(0.0f, bornForce2*term2, obcParams2.x < rScaledRadiusI); tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2 : (real) 0);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED); bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2); bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif #endif
dEdR += select(0.0f, tempdEdR, includeInteraction); dEdR += (includeInteraction ? tempdEdR : (real) 0);
} }
...@@ -10,18 +10,18 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -10,18 +10,18 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornSum, __global const long* restrict bornSum,
#else #else
__global const float* restrict bornSum, __global const real* restrict bornSum,
#endif #endif
__global const float2* restrict params, __global float* restrict bornRadii, __global float* restrict obcChain) { __global const float2* restrict params, __global real* restrict bornRadii, __global real* restrict obcChain) {
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
// Get summed Born data // Get summed Born data
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
float sum = (1.0f/0xFFFFFFFF)*bornSum[index]; real sum = (1/(real) 0xFFFFFFFF)*bornSum[index];
#else #else
float sum = bornSum[index]; real sum = bornSum[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i]; sum += bornSum[i];
#endif #endif
...@@ -30,13 +30,13 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -30,13 +30,13 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
float offsetRadius = params[index].x; float offsetRadius = params[index].x;
sum *= 0.5f*offsetRadius; sum *= 0.5f*offsetRadius;
float sum2 = sum*sum; real sum2 = sum*sum;
float sum3 = sum*sum2; real sum3 = sum*sum2;
float tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3); real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
float nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET; real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
float radius = 1.0f/(1.0f/offsetRadius - tanhSum/nonOffsetRadius); real radius = 1/(1/offsetRadius - tanhSum/nonOffsetRadius);
float chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2); real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2);
chain = (1.0f-tanhSum*tanhSum)*chain / nonOffsetRadius; chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius; bornRadii[index] = radius;
obcChain[index] = chain; obcChain[index] = chain;
index += get_global_size(0); index += get_global_size(0);
...@@ -47,31 +47,31 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -47,31 +47,31 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force. * Reduce the Born force.
*/ */
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bornForce,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornForceIn, __global const long* restrict bornForceIn,
#endif #endif
__global float* restrict energyBuffer, __global const float2* restrict params, __global const float* restrict bornRadii, __global const float* restrict obcChain) { __global real* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) {
float energy = 0.0f; real energy = 0.0f;
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
// Sum the Born force // Sum the Born force
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
float force = (1.0f/0xFFFFFFFF)*bornForceIn[index]; real force = (1/(real) 0xFFFFFFFF)*bornForceIn[index];
#else #else
float force = bornForce[index]; real force = bornForce[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i]; force += bornForce[i];
#endif #endif
// Now calculate the actual force // Now calculate the actual force
float offsetRadius = params[index].x; float offsetRadius = params[index].x;
float bornRadius = bornRadii[index]; real bornRadius = bornRadii[index];
float r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS; real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
float ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, 6.0f); real ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
float saTerm = SURFACE_AREA_FACTOR*r*r*ratio6; real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
force += saTerm/bornRadius; force += saTerm/bornRadius;
energy += saTerm; energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index]; force *= bornRadius*bornRadius*obcChain[index];
......
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float q; real q;
float radius, scaledRadius; float radius, scaledRadius;
float bornSum; real bornSum;
} AtomData1; } AtomData1;
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum(__global float* restrict global_bornSum, __global const float4* restrict posq, __global const float2* restrict global_params, __kernel void computeBornSum(__global real* restrict global_bornSum, __global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -53,7 +53,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c ...@@ -53,7 +53,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
if (lasty != y) { if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex; unsigned int j = y*TILE_SIZE + localAtomIndex;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].z = tempPosq.z;
...@@ -68,31 +68,31 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c ...@@ -68,31 +68,31 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float bornSum = 0.0f; real bornSum = 0.0f;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); real u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.y-r) if (params1.x < params2.y-r)
...@@ -117,46 +117,46 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c ...@@ -117,46 +117,46 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float bornSum = 0.0f; real bornSum = 0.0f;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
float rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); real u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.y-r) if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij); bornSum += 2.0f*(RECIP(params1.x)-l_ij);
} }
float rScaledRadiusI = r+params1.y; real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y))); real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI); real u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.y-r) if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij); term += 2.0f*(RECIP(params2.x)-l_ij);
...@@ -184,20 +184,20 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c ...@@ -184,20 +184,20 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
} }
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float q; real q;
float fx, fy, fz, fw; real fx, fy, fz, fw;
float bornRadius; real bornRadius;
} AtomData2; } AtomData2;
/** /**
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce, __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -209,7 +209,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global ...@@ -209,7 +209,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE]; __local AtomData2 localData[TILE_SIZE];
...@@ -238,7 +238,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global ...@@ -238,7 +238,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
if (lasty != y) { if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex; unsigned int j = y*TILE_SIZE + localAtomIndex;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].z = tempPosq.z;
...@@ -251,34 +251,34 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global ...@@ -251,34 +251,34 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0.0f;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float bornRadius2 = localData[j].bornRadius; real bornRadius2 = localData[j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; real alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); real D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); real expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; real denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); real denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator); real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2); real Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
force.xyz -= delta.xyz*dEdR; force.xyz -= delta.xyz*dEdR;
} }
...@@ -305,34 +305,34 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global ...@@ -305,34 +305,34 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0.0f;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float bornRadius2 = localData[j].bornRadius; real bornRadius2 = localData[j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; real alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); real D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); real expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; real denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); real denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator); real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2); real Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
...@@ -354,7 +354,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global ...@@ -354,7 +354,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
float4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
f.x += localData[tgx].fx; f.x += localData[tgx].fx;
f.y += localData[tgx].fy; f.y += localData[tgx].fy;
f.z += localData[tgx].fz; f.z += localData[tgx].fz;
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float radius, scaledRadius; float radius, scaledRadius;
} AtomData1; } AtomData1;
...@@ -18,11 +18,11 @@ void computeBornSum( ...@@ -18,11 +18,11 @@ void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum, __global long* restrict global_bornSum,
#else #else
__global float* restrict global_bornSum, __global real* restrict global_bornSum,
#endif #endif
__global const float4* restrict posq, __global const float2* restrict global_params, __global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -36,8 +36,8 @@ void computeBornSum( ...@@ -36,8 +36,8 @@ void computeBornSum(
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData1 localData[TILE_SIZE]; __local AtomData1 localData[TILE_SIZE];
__local float localBornSum[FORCE_WORK_GROUP_SIZE]; __local real localBornSum[FORCE_WORK_GROUP_SIZE];
__local float localTemp[TILE_SIZE]; __local real localTemp[TILE_SIZE];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
...@@ -62,8 +62,8 @@ void computeBornSum( ...@@ -62,8 +62,8 @@ void computeBornSum(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1); unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f; real bornSum = 0.0f;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
...@@ -77,27 +77,27 @@ void computeBornSum( ...@@ -77,27 +77,27 @@ void computeBornSum(
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f); real4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius); float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ)); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ)); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#endif #endif
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); real u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeInteraction); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeInteraction);
bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeInteraction && params1.x < params2.y-r); bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeInteraction && params1.x < params2.y-r);
...@@ -128,11 +128,11 @@ void computeBornSum( ...@@ -128,11 +128,11 @@ void computeBornSum(
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[get_local_id(0)].z = tempPosq.z;
float2 tempParams = global_params[j]; real2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[get_local_id(0)].scaledRadius = tempParams.y;
} }
...@@ -143,41 +143,41 @@ void computeBornSum( ...@@ -143,41 +143,41 @@ void computeBornSum(
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1); unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f); real4 delta = (real4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius); float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
{ {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); real u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ); unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ);
bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeTerm); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeTerm);
bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeTerm && params1.x < params2.y-r); bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeTerm && params1.x < params2.y-r);
} }
float rScaledRadiusI = r+params1.y; real rScaledRadiusI = r+params1.y;
{ {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y))); real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI); real u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r); term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI); localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
...@@ -206,8 +206,8 @@ void computeBornSum( ...@@ -206,8 +206,8 @@ void computeBornSum(
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Do both loads before both stores to minimize store-load waits. // Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1]; real sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2]; real sum2 = global_bornSum[offset2];
sum1 += bornSum + localTemp[tgx]; sum1 += bornSum + localTemp[tgx];
sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE]; sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1; global_bornSum[offset1] = sum1;
...@@ -222,15 +222,15 @@ void computeBornSum( ...@@ -222,15 +222,15 @@ void computeBornSum(
} }
typedef struct { typedef struct {
float x, y, z, w; real x, y, z, w;
float padding; real padding;
} PaddedUnalignedFloat4; } PaddedUnalignedFloat4;
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float q; real q;
float bornRadius; real bornRadius;
float temp_x, temp_y, temp_z, temp_w; real temp_x, temp_y, temp_z, temp_w;
} AtomData2; } AtomData2;
/** /**
...@@ -242,11 +242,11 @@ void computeGBSAForce1( ...@@ -242,11 +242,11 @@ void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce, __global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else #else
__global float4* restrict forceBuffers, __global float* restrict global_bornForce, __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -258,7 +258,7 @@ void computeGBSAForce1( ...@@ -258,7 +258,7 @@ void computeGBSAForce1(
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE]; __local AtomData2 localData[TILE_SIZE];
__local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE]; __local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
...@@ -286,9 +286,9 @@ void computeGBSAForce1( ...@@ -286,9 +286,9 @@ void computeGBSAForce1(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1); unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; real4 force = 0.0f;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
...@@ -302,26 +302,26 @@ void computeGBSAForce1( ...@@ -302,26 +302,26 @@ void computeGBSAForce1(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q); real4 posq2 = (real4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float bornRadius2 = localData[baseLocalAtom+j].bornRadius; real bornRadius2 = localData[baseLocalAtom+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; real alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); real D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); real expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; real denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); real denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator); real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2); real Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED); dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED);
tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED); tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED);
...@@ -355,9 +355,9 @@ void computeGBSAForce1( ...@@ -355,9 +355,9 @@ void computeGBSAForce1(
#else #else
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
float4 sum = forceBuffers[offset]; real4 sum = forceBuffers[offset];
float global_sum = global_bornForce[offset]; real global_sum = global_bornForce[offset];
sum.x += force.x + localData[tgx].temp_x; sum.x += force.x + localData[tgx].temp_x;
sum.y += force.y + localData[tgx].temp_y; sum.y += force.y + localData[tgx].temp_y;
sum.z += force.z + localData[tgx].temp_z; sum.z += force.z + localData[tgx].temp_z;
...@@ -373,7 +373,7 @@ void computeGBSAForce1( ...@@ -373,7 +373,7 @@ void computeGBSAForce1(
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[get_local_id(0)].z = tempPosq.z;
...@@ -391,26 +391,26 @@ void computeGBSAForce1( ...@@ -391,26 +391,26 @@ void computeGBSAForce1(
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1); unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q); real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
float bornRadius2 = localData[tj].bornRadius; real bornRadius2 = localData[tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; real alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); real D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); real expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; real denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); real denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator); real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2); real Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED); dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED);
tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED); tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED);
...@@ -458,11 +458,11 @@ void computeGBSAForce1( ...@@ -458,11 +458,11 @@ void computeGBSAForce1(
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1]; real4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2]; real4 sum2 = forceBuffers[offset2];
float global_sum1 = global_bornForce[offset1]; real global_sum1 = global_bornForce[offset1];
float global_sum2 = global_bornForce[offset2]; real global_sum2 = global_bornForce[offset2];
sum1.x += force.x + localData[tgx].temp_x; sum1.x += force.x + localData[tgx].temp_x;
sum1.y += force.y + localData[tgx].temp_y; sum1.y += force.y + localData[tgx].temp_y;
sum1.z += force.z + localData[tgx].temp_z; sum1.z += force.z + localData[tgx].temp_z;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
* Scale the particle positions. * Scale the particle positions.
*/ */
__kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* restrict posq, __kernel void scalePositions(float scale, int numMolecules, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global real4* restrict posq,
__global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) { __global const int* restrict moleculeAtoms, __global const int* restrict moleculeStartIndex) {
for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) { for (int index = get_global_id(0); index < numMolecules; index += get_global_size(0)) {
int first = moleculeStartIndex[index]; int first = moleculeStartIndex[index];
...@@ -11,24 +11,24 @@ __kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSi ...@@ -11,24 +11,24 @@ __kernel void scalePositions(float scale, int numMolecules, float4 periodicBoxSi
// Find the center of each molecule. // Find the center of each molecule.
float4 center = (float4) 0.0f; real4 center = (real4) 0;
for (int atom = first; atom < last; atom++) for (int atom = first; atom < last; atom++)
center += posq[moleculeAtoms[atom]]; center += posq[moleculeAtoms[atom]];
center /= (float) numAtoms; center /= (real) numAtoms;
// Move it into the first periodic box. // Move it into the first periodic box.
int xcell = (int) floor(center.x*invPeriodicBoxSize.x); int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
int ycell = (int) floor(center.y*invPeriodicBoxSize.y); int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
int zcell = (int) floor(center.z*invPeriodicBoxSize.z); int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
float4 delta = (float4) (xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z, 0); real4 delta = (real4) (xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z, 0);
center -= delta; center -= delta;
// Now scale the position of the molecule center. // Now scale the position of the molecule center.
delta = center*(scale-1)-delta; delta = center*(scale-1)-delta;
for (int atom = first; atom < last; atom++) { for (int atom = first; atom < last; atom++) {
float4 pos = posq[moleculeAtoms[atom]]; real4 pos = posq[moleculeAtoms[atom]];
pos.xyz += delta.xyz; pos.xyz += delta.xyz;
posq[moleculeAtoms[atom]] = pos; posq[moleculeAtoms[atom]] = pos;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment