Unverified Commit ae686364 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Improved support for devices without 64 bit atomics (#3737)

parent 48664a1f
......@@ -786,8 +786,6 @@ private:
ComputeArray globals;
ComputeArray donors;
ComputeArray acceptors;
ComputeArray donorBufferIndices;
ComputeArray acceptorBufferIndices;
ComputeArray donorExclusions;
ComputeArray acceptorExclusions;
std::vector<std::string> globalParamNames;
......
......@@ -1483,8 +1483,6 @@ void CommonCalcCustomCentroidBondForceKernel::initialize(const System& system, c
numBonds = force.getNumBonds();
if (numBonds == 0)
return;
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("CustomCentroidBondForce requires a device that supports 64 bit atomic operations");
info = new ForceInfo(force);
cc.addForce(info);
......@@ -2380,8 +2378,7 @@ double CommonCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (interactionGroupData.isInitialized()) {
if (!hasInitializedKernel) {
hasInitializedKernel = true;
bool useLong = cc.getSupports64BitGlobalAtomics();
interactionGroupKernel->addArg((useLong ? cc.getLongForceBuffer() : cc.getForceBuffers()));
interactionGroupKernel->addArg(cc.getLongForceBuffer());
interactionGroupKernel->addArg(cc.getEnergyBuffer());
interactionGroupKernel->addArg(cc.getPosq());
interactionGroupKernel->addArg((useNeighborList ? filteredGroupData : interactionGroupData));
......@@ -2501,15 +2498,8 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
charges.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "gbsaObcCharges");
bornRadii.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "bornRadii");
obcChain.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "obcChain");
if (cc.getSupports64BitGlobalAtomics()) {
bornSum.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornSum");
bornForce.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornForce");
}
else {
int bufferSize = cc.getPaddedNumAtoms()*nb.getNumForceBuffers();
bornSum.initialize(cc, bufferSize, elementSize, "bornSum");
bornForce.initialize(cc, bufferSize, elementSize, "bornForce");
}
cc.addAutoclearBuffer(bornSum);
cc.addAutoclearBuffer(bornForce);
vector<double> chargeVec(cc.getPaddedNumAtoms());
......@@ -2541,7 +2531,7 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup());
nb.addParameter(ComputeParameterInfo(charges, prefix+"charge", "float", 1));
nb.addParameter(ComputeParameterInfo(params, prefix+"obcParams", "float", 2));
nb.addParameter(ComputeParameterInfo(bornForce, prefix+"bornForce", cc.getSupports64BitGlobalAtomics() ? "mm_long" : "real", 1));
nb.addParameter(ComputeParameterInfo(bornForce, prefix+"bornForce", "mm_long", 1));
info = new ForceInfo(force);
cc.addForce(info);
}
......@@ -2580,7 +2570,6 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
else
file = CommonKernelSources::gbsaObc;
ComputeProgram program = cc.compileProgram(file, defines);
bool useLong = cc.getSupports64BitGlobalAtomics();
computeBornSumKernel = program->createKernel("computeBornSum");
computeBornSumKernel->addArg(bornSum);
computeBornSumKernel->addArg(cc.getPosq());
......@@ -2600,7 +2589,7 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel->addArg(numAtomBlocks*(numAtomBlocks+1)/2);
computeBornSumKernel->addArg(nb.getExclusionTiles());
force1Kernel = program->createKernel("computeGBSAForce1");
force1Kernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers());
force1Kernel->addArg(cc.getLongForceBuffer());
force1Kernel->addArg(bornForce);
force1Kernel->addArg(cc.getEnergyBuffer());
force1Kernel->addArg(cc.getPosq());
......@@ -2626,19 +2615,11 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornSumKernel->addArg(0.8f);
reduceBornSumKernel->addArg(4.85f);
reduceBornSumKernel->addArg(bornSum);
if (!useLong) {
reduceBornSumKernel->addArg(cc.getPaddedNumAtoms());
reduceBornSumKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
reduceBornSumKernel->addArg(params);
reduceBornSumKernel->addArg(bornRadii);
reduceBornSumKernel->addArg(obcChain);
reduceBornForceKernel = program->createKernel("reduceBornForce");
reduceBornForceKernel->addArg(bornForce);
if (!useLong) {
reduceBornForceKernel->addArg(cc.getPaddedNumAtoms());
reduceBornForceKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
reduceBornForceKernel->addArg(cc.getEnergyBuffer());
reduceBornForceKernel->addArg(params);
reduceBornForceKernel->addArg(bornRadii);
......@@ -2872,28 +2853,17 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
energyParamDerivExpressions[i].push_back(ex.differentiate(force.getEnergyParameterDerivativeName(j)).optimize());
}
bool deviceIsCpu = cc.getIsCPU();
bool useLong = cc.getSupports64BitGlobalAtomics();
int elementSize = (cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
if (useLong) {
valueBuffers.initialize<long long>(cc, cc.getPaddedNumAtoms(), "customGBValueBuffers");
longEnergyDerivs.initialize<long long>(cc, numComputedValues*cc.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
energyDerivs = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
}
else {
int bufferSize = cc.getPaddedNumAtoms()*nb.getNumForceBuffers();
valueBuffers.initialize(cc, bufferSize, elementSize, "customGBValueBuffers");
energyDerivs = new ComputeParameterSet(cc, numComputedValues, bufferSize, "customGBEnergyDerivatives", true);
}
cc.addAutoclearBuffer(valueBuffers);
energyDerivChain = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivativeChain", true);
needEnergyParamDerivs = (force.getNumEnergyParameterDerivatives() > 0);
dValue0dParam.resize(force.getNumEnergyParameterDerivatives());
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
dValuedParam.push_back(new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "dValuedParam", true, cc.getUseDoublePrecision()));
if (useLong)
dValue0dParam[i].initialize<long long>(cc, cc.getPaddedNumAtoms(), "dValue0dParam");
else
dValue0dParam[i].initialize(cc, cc.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "dValue0dParam");
cc.addAutoclearBuffer(dValue0dParam[i]);
string name = force.getEnergyParameterDerivativeName(i);
cc.addEnergyParameterDerivative(name);
......@@ -2960,10 +2930,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
string derivName = "dValue0dParam"+cc.intToString(i+1);
if (useLong)
extraArgs << ", GLOBAL mm_ulong* RESTRICT global_" << derivName;
else
extraArgs << ", GLOBAL real* RESTRICT global_" << derivName;
atomParams << "LOCAL real local_" << derivName << "[LOCAL_BUFFER_SIZE];\n";
loadLocal2 << "local_" << derivName << "[localAtomIndex] = 0;\n";
load1 << "real " << derivName << " = 0;\n";
......@@ -2975,21 +2942,12 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
tempDerivs2 << "local_" << derivName << "[j] += temp_" << derivName << "_2;\n";
else
tempDerivs2 << "local_" << derivName << "[tbx+tj] += temp_" << derivName << "_2;\n";
if (useLong) {
storeDeriv1 << "ATOMIC_ADD(&global_" << derivName << "[offset1], (mm_ulong) realToFixedPoint(" << derivName << "));\n";
if (deviceIsCpu)
storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[tgx]));\n";
else
storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[LOCAL_ID]));\n";
}
else {
storeDeriv1 << "global_" << derivName << "[offset1] += " << derivName << ";\n";
if (deviceIsCpu)
storeDeriv2 << "global_" << derivName << "[offset2] += local_" << derivName << "[tgx];\n";
else
storeDeriv2 << "global_" << derivName << "[offset2] += local_" << derivName << "[LOCAL_ID];\n";
}
}
}
replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
replacements["ATOM_PARAMETER_DATA"] = atomParams.str();
......@@ -3039,16 +2997,8 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
string variableName = "dValuedParam_0_"+cc.intToString(i);
if (useLong) {
extraArgs << ", GLOBAL const mm_long* RESTRICT dValue0dParam" << i;
deriv0 << "real " << variableName << " = RECIP((real) 0x100000000)*dValue0dParam" << i << "[index];\n";
}
else {
extraArgs << ", GLOBAL const real* RESTRICT dValue0dParam" << i;
deriv0 << "real " << variableName << " = dValue0dParam" << i << "[index];\n";
deriv0 << "for (int i = index+bufferSize; i < totalSize; i += bufferSize)\n";
deriv0 << " " << variableName << " += dValue0dParam" << i << "[i];\n";
}
for (int j = 0; j < dValuedParam[i]->getParameterInfos().size(); j++)
extraArgs << ", GLOBAL real* RESTRICT global_dValuedParam_" << j << "_" << i;
deriv0 << "global_dValuedParam_0_" << i << "[index] = dValuedParam_0_" << i << ";\n";
......@@ -3129,7 +3079,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
map<string, Lepton::ParsedExpression> n2EnergyExpressions;
n2EnergyExpressions["tempEnergy += "] = Lepton::Parser::parse(expression, functions).optimize();
n2EnergyExpressions["dEdR += "] = Lepton::Parser::parse(expression, functions).differentiate("r").optimize();
if (useLong) {
for (int j = 0; j < numComputedValues; j++) {
if (needChainForValue[j]) {
string index = cc.intToString(j+1);
......@@ -3137,15 +3086,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1];
}
}
}
else {
for (int j = 0; j < numComputedValues; j++) {
if (needChainForValue[j]) {
n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_1")+" += "] = energyDerivExpressions[i][2*j];
n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_2")+" += "] = energyDerivExpressions[i][2*j+1];
}
}
}
for (int j = 0; j < force.getNumEnergyParameterDerivatives(); j++)
n2EnergyExpressions["energyParamDeriv"+cc.intToString(j)+" += interactionScale*"] = energyParamDerivExpressions[i][j];
if (exclude)
......@@ -3188,7 +3128,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
pairEnergyUsesValue[i] = true;
}
}
if (useLong) {
extraArgs << ", GLOBAL mm_ulong* RESTRICT derivBuffers";
for (int i = 0; i < numComputedValues; i++) {
string index = cc.intToString(i+1);
......@@ -3200,21 +3139,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
}
}
else {
for (int i = 0; i < (int) energyDerivs->getParameterInfos().size(); i++) {
const ComputeParameterInfo& buffer = energyDerivs->getParameterInfos()[i];
string index = cc.intToString(i+1);
extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivBuffers" << index;
atomParams << "LOCAL " << buffer.getType() << " local_deriv" << index << "[LOCAL_BUFFER_SIZE];\n";
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
}
}
if (needEnergyParamDerivs) {
extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs";
const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames();
......@@ -3285,16 +3209,10 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = cc.intToString(i+1);
extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivChain" << index;
}
if (useLong) {
extraArgs << ", GLOBAL const mm_long* RESTRICT derivBuffersIn";
for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
" = RECIP((real) 0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cc.intToString(i) << "];\n";
}
else {
for (int i = 0; i < (int) energyDerivs->getParameterInfos().size(); i++)
reduce << "REDUCE_VALUE(derivBuffers" << cc.intToString(i+1) << ", " << energyDerivs->getParameterInfos()[i].getType() << ")\n";
}
if (needEnergyParamDerivs) {
extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs";
const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames();
......@@ -3353,13 +3271,9 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = cc.intToString(i+1);
compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
}
if (useLong) {
compute << "forceBuffers[index] += realToFixedPoint(force.x);\n";
compute << "forceBuffers[index+PADDED_NUM_ATOMS] += realToFixedPoint(force.y);\n";
compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += realToFixedPoint(force.z);\n";
}
else
compute << "forceBuffers[index] = forceBuffers[index]+make_real4(force.x, force.y, force.z, 0);\n";
for (int i = 1; i < numComputedValues; i++) {
compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
for (int j = 1; j < i; j++)
......@@ -3548,12 +3462,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
info = new ForceInfo(force);
cc.addForce(info);
if (useLong)
cc.addAutoclearBuffer(longEnergyDerivs);
else {
for (auto& buffer : energyDerivs->getParameterInfos())
cc.addAutoclearBuffer(buffer.getArray());
}
}
double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
......@@ -3600,7 +3509,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
int numAtomBlocks = cc.getPaddedNumAtoms()/32;
bool useLong = cc.getSupports64BitGlobalAtomics();
pairValueKernel->addArg(cc.getPosq());
pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusions());
pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusionTiles());
......@@ -3631,10 +3539,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel->addArg(function);
perParticleValueKernel->addArg(cc.getPosq());
perParticleValueKernel->addArg(valueBuffers);
if (!useLong) {
perParticleValueKernel->addArg(cc.getPaddedNumAtoms());
perParticleValueKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
if (globals.isInitialized())
perParticleValueKernel->addArg(globals);
for (auto& buffer : params->getParameterInfos())
......@@ -3648,7 +3552,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
}
for (auto& function : tabulatedFunctionArrays)
perParticleValueKernel->addArg(function);
pairEnergyKernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers());
pairEnergyKernel->addArg(cc.getLongForceBuffer());
pairEnergyKernel->addArg(cc.getEnergyBuffer());
pairEnergyKernel->addArg(cc.getPosq());
pairEnergyKernel->addArg(cc.getNonbondedUtilities().getExclusions());
......@@ -3680,24 +3584,14 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel->addArg(buffer.getArray());
}
}
if (useLong)
pairEnergyKernel->addArg(longEnergyDerivs);
else
for (auto& buffer : energyDerivs->getParameterInfos())
pairEnergyKernel->addArg(buffer.getArray());
if (needEnergyParamDerivs)
pairEnergyKernel->addArg(cc.getEnergyParamDerivBuffer());
for (auto& function : tabulatedFunctionArrays)
pairEnergyKernel->addArg(function);
perParticleEnergyKernel->addArg(cc.getEnergyBuffer());
perParticleEnergyKernel->addArg(cc.getPosq());
if (cc.getSupports64BitGlobalAtomics())
perParticleEnergyKernel->addArg(cc.getLongForceBuffer());
else {
perParticleEnergyKernel->addArg(cc.getForceBuffers());
perParticleEnergyKernel->addArg(cc.getPaddedNumAtoms());
perParticleEnergyKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
if (globals.isInitialized())
perParticleEnergyKernel->addArg(globals);
for (auto& buffer : params->getParameterInfos())
......@@ -3708,7 +3602,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
perParticleEnergyKernel->addArg(buffer.getArray());
for (auto& buffer : energyDerivChain->getParameterInfos())
perParticleEnergyKernel->addArg(buffer.getArray());
if (useLong)
perParticleEnergyKernel->addArg(longEnergyDerivs);
if (needEnergyParamDerivs)
perParticleEnergyKernel->addArg(cc.getEnergyParamDerivBuffer());
......@@ -3716,7 +3609,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
perParticleEnergyKernel->addArg(function);
if (needParameterGradient || needEnergyParamDerivs) {
gradientChainRuleKernel->addArg(cc.getPosq());
gradientChainRuleKernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers());
gradientChainRuleKernel->addArg(cc.getLongForceBuffer());
if (globals.isInitialized())
gradientChainRuleKernel->addArg(globals);
for (auto& buffer : params->getParameterInfos())
......@@ -3937,33 +3830,6 @@ void CommonCalcCustomHbondForceKernel::initialize(const System& system, const Cu
}
acceptors.upload(acceptorVector);
acceptorParams->setParameterValues(acceptorParamVector);
// Select an output buffer index for each donor and acceptor.
if (!cc.getSupports64BitGlobalAtomics()) {
donorBufferIndices.initialize<mm_int4>(cc, numDonors, "customHbondDonorBuffers");
acceptorBufferIndices.initialize<mm_int4>(cc, numAcceptors, "customHbondAcceptorBuffers");
vector<mm_int4> donorBufferVector(numDonors);
vector<mm_int4> acceptorBufferVector(numAcceptors);
vector<int> donorBufferCounter(numParticles, 0);
for (int i = 0; i < numDonors; i++)
donorBufferVector[i] = mm_int4(donorVector[i].x > -1 ? donorBufferCounter[donorVector[i].x]++ : 0,
donorVector[i].y > -1 ? donorBufferCounter[donorVector[i].y]++ : 0,
donorVector[i].z > -1 ? donorBufferCounter[donorVector[i].z]++ : 0, 0);
vector<int> acceptorBufferCounter(numParticles, 0);
for (int i = 0; i < numAcceptors; i++)
acceptorBufferVector[i] = mm_int4(acceptorVector[i].x > -1 ? acceptorBufferCounter[acceptorVector[i].x]++ : 0,
acceptorVector[i].y > -1 ? acceptorBufferCounter[acceptorVector[i].y]++ : 0,
acceptorVector[i].z > -1 ? acceptorBufferCounter[acceptorVector[i].z]++ : 0, 0);
donorBufferIndices.upload(donorBufferVector);
acceptorBufferIndices.upload(acceptorBufferVector);
int maxBuffers = 1;
for (int i : donorBufferCounter)
maxBuffers = max(maxBuffers, i);
for (int i : acceptorBufferCounter)
maxBuffers = max(maxBuffers, i);
cc.requestForceBuffers(maxBuffers);
}
info = new ForceInfo(force);
cc.addForce(info);
......@@ -4243,12 +4109,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
}
if (!hasInitializedKernel) {
hasInitializedKernel = true;
if (cc.getSupports64BitGlobalAtomics())
donorKernel->addArg(cc.getLongForceBuffer());
else {
donorKernel->addArg(cc.getForceBuffers());
donorKernel->addArg(donorBufferIndices);
}
donorKernel->addArg(cc.getEnergyBuffer());
donorKernel->addArg(cc.getPosq());
donorKernel->addArg(donorExclusions);
......@@ -4264,12 +4125,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
donorKernel->addArg(parameter.getArray());
for (auto& function : tabulatedFunctionArrays)
donorKernel->addArg(function);
if (cc.getSupports64BitGlobalAtomics())
acceptorKernel->addArg(cc.getLongForceBuffer());
else {
acceptorKernel->addArg(cc.getForceBuffers());
acceptorKernel->addArg(acceptorBufferIndices);
}
acceptorKernel->addArg(cc.getEnergyBuffer());
acceptorKernel->addArg(cc.getPosq());
acceptorKernel->addArg(acceptorExclusions);
......@@ -4286,9 +4142,9 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
for (auto& function : tabulatedFunctionArrays)
acceptorKernel->addArg(function);
}
setPeriodicBoxArgs(cc, donorKernel, cc.getSupports64BitGlobalAtomics() ? 6 : 7);
setPeriodicBoxArgs(cc, donorKernel, 6);
donorKernel->execute(max(numDonors, numAcceptors), 64);
setPeriodicBoxArgs(cc, acceptorKernel, cc.getSupports64BitGlobalAtomics() ? 6 : 7);
setPeriodicBoxArgs(cc, acceptorKernel, 6);
acceptorKernel->execute(max(numDonors, numAcceptors), 64);
return 0.0;
}
......@@ -4391,8 +4247,6 @@ CommonCalcCustomManyParticleForceKernel::~CommonCalcCustomManyParticleForceKerne
void CommonCalcCustomManyParticleForceKernel::initialize(const System& system, const CustomManyParticleForce& force) {
ContextSelector selector(cc);
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("CustomManyParticleForce requires a device that supports 64 bit atomic operations");
int numParticles = force.getNumParticles();
int particlesPerSet = force.getNumParticlesPerSet();
bool centralParticleMode = (force.getPermutationMode() == CustomManyParticleForce::UniqueCentralParticle);
......@@ -4933,9 +4787,6 @@ private:
};
void CommonCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("GayBerneForce requires a device that supports 64 bit atomic operations");
// Initialize interactions.
ContextSelector selector(cc);
......
......@@ -528,8 +528,6 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
for (int i = 0; i < numAtoms; i++)
if (atomCounts[i] > 1)
hasOverlappingVsites = true;
if (hasOverlappingVsites && !context.getSupports64BitGlobalAtomics())
throw OpenMMException("This device does not support 64 bit atomics. Cannot have multiple virtual sites that depend on the same atom.");
// Create the kernels used by this class.
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[LOCAL_ID]));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
#endif
/**
* Compute a force based on pair interactions.
*/
KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
GLOBAL real4* RESTRICT forceBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles, int needEnergy,
......@@ -160,7 +151,6 @@ KERNEL void computeN2Energy(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
......@@ -173,18 +163,6 @@ KERNEL void computeN2Energy(
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[LOCAL_ID].z));
STORE_DERIVATIVES_2
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = offset1;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
if (x != y) {
offset = offset2;
forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
STORE_DERIVATIVES_2
}
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
......@@ -363,7 +341,6 @@ KERNEL void computeN2Energy(
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
......@@ -376,18 +353,6 @@ KERNEL void computeN2Energy(
offset = atom2;
STORE_DERIVATIVES_2
}
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
unsigned int offset = offset1;
STORE_DERIVATIVES_1
if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
offset = offset2;
STORE_DERIVATIVES_2
}
#endif
}
pos++;
}
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[tgx]));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
#endif
/**
* Compute a force based on pair interactions.
*/
KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
GLOBAL real4* RESTRICT forceBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles, int needEnergy,
......@@ -100,17 +91,11 @@ KERNEL void computeN2Energy(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
}
}
else {
......@@ -174,33 +159,21 @@ KERNEL void computeN2Energy(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(local_force[tgx].x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
STORE_DERIVATIVES_2
#else
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
}
}
}
......@@ -316,17 +289,11 @@ KERNEL void computeN2Energy(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
}
}
else
......@@ -375,17 +342,11 @@ KERNEL void computeN2Energy(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
}
}
......@@ -398,17 +359,11 @@ KERNEL void computeN2Energy(
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(local_force[tgx].x));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
unsigned int offset = atom2;
STORE_DERIVATIVES_2
#else
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
}
}
}
......
......@@ -10,20 +10,13 @@
*/
KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) {
mixed energy = 0;
INIT_PARAM_DERIVS
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Reduce the derivatives
#ifndef SUPPORTS_64_BIT_ATOMICS
int totalSize = bufferSize*numBuffers;
#endif
REDUCE_DERIVATIVES
// Now calculate the per-particle energy terms.
......
......@@ -3,29 +3,17 @@
*/
KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers
#endif
PARAMETER_ARGUMENTS) {
INIT_PARAM_DERIVS
const real scale = RECIP((real) 0x100000000);
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
real4 pos = posq[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
#else
real3 force = trimTo3(forceBuffers[index]);
#endif
COMPUTE_FORCES
#ifdef SUPPORTS_64_BIT_ATOMICS
forceBuffers[index] = realToFixedPoint(force.x);
forceBuffers[index+PADDED_NUM_ATOMS] = realToFixedPoint(force.y);
forceBuffers[index+PADDED_NUM_ATOMS*2] = realToFixedPoint(force.z);
#else
forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
#endif
}
SAVE_PARAM_DERIVS
}
......@@ -3,11 +3,7 @@
*/
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value,
#else
GLOBAL real* RESTRICT global_value,
#endif
#ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
......@@ -137,7 +133,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = x*TILE_SIZE + tgx;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
STORE_PARAM_DERIVS1
......@@ -146,16 +141,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
STORE_PARAM_DERIVS2
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_value[offset1] += value;
STORE_PARAM_DERIVS1
if (x != y) {
global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2
}
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
......@@ -317,7 +302,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
STORE_PARAM_DERIVS1
......@@ -326,16 +310,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
STORE_PARAM_DERIVS2
}
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
global_value[offset1] += value;
STORE_PARAM_DERIVS1
if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2
}
#endif
}
pos++;
}
......
......@@ -3,11 +3,7 @@
*/
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value,
#else
GLOBAL real* RESTRICT global_value,
#endif
#ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
......@@ -84,13 +80,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1
}
}
......@@ -146,26 +137,16 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = y*TILE_SIZE+tgx;
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
#else
unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx];
#endif
STORE_PARAM_DERIVS2
}
}
......@@ -273,13 +254,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1
}
}
......@@ -322,13 +298,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1
}
}
......@@ -342,13 +313,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = atom2;
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
#else
unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx];
#endif
STORE_PARAM_DERIVS2
}
}
......
......@@ -3,23 +3,12 @@
*/
KERNEL void computePerParticleValues(GLOBAL real4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* valueBuffers
#else
GLOBAL real* valueBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) {
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS
real sum = valueBuffers[index]/(real) 0x100000000;
#else
int totalSize = bufferSize*numBuffers;
real sum = valueBuffers[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += valueBuffers[i];
#endif
REDUCE_PARAM0_DERIV
// Now calculate other values
......
......@@ -44,11 +44,7 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
* Compute forces on donors.
*/
KERNEL void computeDonorForces(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
......@@ -114,7 +110,6 @@ KERNEL void computeDonorForces(
// Write results
if (donorIndex < NUM_DONORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
......@@ -133,27 +128,6 @@ KERNEL void computeDonorForces(
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
MEM_FENCE;
}
#else
int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
#endif
}
}
energyBuffer[GLOBAL_ID] += energy;
......@@ -162,11 +136,7 @@ KERNEL void computeDonorForces(
* Compute forces on acceptors.
*/
KERNEL void computeAcceptorForces(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
......@@ -231,7 +201,6 @@ KERNEL void computeAcceptorForces(
// Write results
if (acceptorIndex < NUM_ACCEPTORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
......@@ -250,27 +219,6 @@ KERNEL void computeAcceptorForces(
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
MEM_FENCE;
}
#else
int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
#endif
}
}
}
......@@ -35,38 +35,8 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
#endif
}
#ifndef SUPPORTS_64_BIT_ATOMICS
/**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom.
*/
void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
localData[LOCAL_ID].x = atomIndex;
SYNC_WARPS;
real4 forceSum = make_real4(0);
int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
int end = start+32;
bool isFirst = true;
for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= LOCAL_ID);
}
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
if (isFirst)
forceBuffers[offset] += forceSum;
SYNC_WARPS;
}
#endif
KERNEL void computeInteractionGroups(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
GLOBAL real4* RESTRICT forceBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
......@@ -139,7 +109,6 @@ KERNEL void computeInteractionGroups(
}
SYNC_WARPS;
}
#ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) {
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
......@@ -149,13 +118,6 @@ KERNEL void computeInteractionGroups(
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
SYNC_WARPS;
#else
writeForces(forceBuffers, localData, atom2);
localData[LOCAL_ID].fx = force.x;
localData[LOCAL_ID].fy = force.y;
localData[LOCAL_ID].fz = force.z;
writeForces(forceBuffers, localData, atom1);
#endif
}
energyBuffer[GLOBAL_ID] += energy;
SAVE_DERIVATIVES
......
......@@ -17,11 +17,7 @@ typedef struct ALIGN {
* Compute the Born sum.
*/
KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_bornSum,
#else
GLOBAL real* RESTRICT global_bornSum,
#endif
GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
......@@ -152,20 +148,12 @@ KERNEL void computeBornSum(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(bornSum));
if (x != y) {
offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
if (x != y)
global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
......@@ -357,17 +345,9 @@ KERNEL void computeBornSum(
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) realToFixedPoint(bornSum));
if (atom2 < PADDED_NUM_ATOMS)
ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
if (atom2 < PADDED_NUM_ATOMS)
global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif
}
pos++;
}
......@@ -385,11 +365,7 @@ typedef struct ALIGN {
*/
KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF
......@@ -538,7 +514,6 @@ KERNEL void computeGBSAForce1(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
......@@ -551,16 +526,6 @@ KERNEL void computeGBSAForce1(
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w;
if (x != y) {
forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[LOCAL_ID].fw;
}
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
......@@ -763,7 +728,6 @@ KERNEL void computeGBSAForce1(
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
......@@ -774,16 +738,6 @@ KERNEL void computeGBSAForce1(
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
}
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w;
if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[LOCAL_ID].fw;
}
#endif
}
pos++;
}
......
......@@ -16,13 +16,8 @@
real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
#ifdef SUPPORTS_64_BIT_ATOMICS
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
#else
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#endif
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else
......
......@@ -6,23 +6,12 @@
*/
KERNEL void reduceBornSum(float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL const mm_long* RESTRICT bornSum,
#else
GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
#endif
GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Get summed Born data
#ifdef SUPPORTS_64_BIT_ATOMICS
real sum = RECIP((real) 0x100000000)*bornSum[index];
#else
real sum = bornSum[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
#endif
// Now calculate Born radius and OBC term.
......@@ -45,24 +34,14 @@ KERNEL void reduceBornSum(float alpha, float beta, float gamma,
*/
KERNEL void reduceBornForce(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT bornForce,
#else
GLOBAL real* bornForce, int bufferSize, int numBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
mixed energy = 0;
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Get summed Born force
#ifdef SUPPORTS_64_BIT_ATOMICS
real force = RECIP((real) 0x100000000)*bornForce[index];
#else
real force = bornForce[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
#endif
// Now calculate the actual force
float offsetRadius = params[index].x;
......@@ -73,11 +52,7 @@ KERNEL void reduceBornForce(
force += saTerm/bornRadius;
energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
bornForce[index] = realToFixedPoint(force);
#else
bornForce[index] = force;
#endif
}
energyBuffer[GLOBAL_ID] += energy/-6;
}
......@@ -9,11 +9,7 @@ typedef struct {
* Compute the Born sum.
*/
KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT global_bornSum,
#else
GLOBAL real* RESTRICT global_bornSum,
#endif
GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
......@@ -87,12 +83,7 @@ KERNEL void computeBornSum(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
else {
......@@ -149,24 +140,14 @@ KERNEL void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&global_bornSum[offset], realToFixedPoint(localData[tgx].bornSum));
#else
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
}
}
}
......@@ -296,12 +277,7 @@ KERNEL void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
else
......@@ -359,12 +335,7 @@ KERNEL void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
......@@ -377,12 +348,7 @@ KERNEL void computeBornSum(
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom2], realToFixedPoint(localData[tgx].bornSum));
#else
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
}
}
}
......@@ -402,11 +368,7 @@ typedef struct {
*/
KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF
......@@ -490,16 +452,10 @@ KERNEL void computeGBSAForce1(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
}
else {
......@@ -561,36 +517,20 @@ KERNEL void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
ATOMIC_ADD(&global_bornForce[offset], realToFixedPoint(localData[tgx].fw));
#else
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
#endif
}
}
}
......@@ -722,16 +662,10 @@ KERNEL void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
}
else
......@@ -790,16 +724,10 @@ KERNEL void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
}
}
......@@ -812,20 +740,10 @@ KERNEL void computeGBSAForce1(
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
ATOMIC_ADD(&global_bornForce[atom2], realToFixedPoint(localData[tgx].fw));
#else
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
#endif
}
}
}
......
KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
#ifndef SUPPORTS_64_BIT_ATOMICS
, GLOBAL real4* RESTRICT pmeBsplineTheta, LOCAL real4* RESTRICT bsplinesCache,
#ifdef CHARGE_FROM_SIGEPS
GLOBAL const float2* RESTRICT sigmaEpsilon
#else
GLOBAL const real* RESTRICT charges
#endif
#endif
) {
// Compute the index of the grid point each atom is associated with.
......@@ -25,42 +17,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
#ifndef SUPPORTS_64_BIT_ATOMICS
// Compute B-splines here for use in the charge spreading kernel.
const real4 scale = 1/(real) (PME_ORDER-1);
LOCAL real4* data = &bsplinesCache[LOCAL_ID*PME_ORDER];
real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f;
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real4(k))*data[j-k-2] + (-dr+make_real4(j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real4(j))*data[PME_ORDER-j-2] + (-dr+make_real4(PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
for (int j = 0; j < PME_ORDER; j++) {
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
const real charge = CHARGE;
#endif
data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
pmeBsplineTheta[atom+j*NUM_ATOMS] = data[j];
}
#endif
}
}
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
......@@ -206,197 +165,6 @@ KERNEL void finishSpreadCharge(
#endif
}
}
#elif defined(DEVICE_IS_CPU)
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ,
#ifdef CHARGE_FROM_SIGEPS
GLOBAL const float2* RESTRICT sigmaEpsilon
#else
GLOBAL const real* RESTRICT charges
#endif
) {
const int firstx = GLOBAL_ID*GRID_SIZE_X/GLOBAL_SIZE;
const int lastx = (GLOBAL_ID+1)*GRID_SIZE_X/GLOBAL_SIZE;
if (firstx == lastx)
return;
const real4 scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER];
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for (int i = 0; i < NUM_ATOMS; i++) {
int atom = i;
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = (real3) (pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*GRID_SIZE_X;
t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
// Spread the charge from this atom onto each grid point.
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
const real charge = (CHARGE)*EPSILON_FACTOR;
#endif
if (charge == 0)
continue;
bool hasComputedThetas = false;
for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
if (xindex < firstx || xindex >= lastx)
continue;
if (!hasComputedThetas) {
hasComputedThetas = true;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f;
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
}
for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
}
}
}
}
}
#else
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
KERNEL void findAtomRangeForGrid(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL int* RESTRICT pmeAtomRange, GLOBAL const real4* RESTRICT posq) {
int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (GLOBAL_ID == GLOBAL_SIZE-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
KERNEL void recordZIndex(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL const real4* RESTRICT posq, real4 periodicBoxSize, real4 recipBoxVecZ) {
int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
for (int i = start; i < end; ++i) {
real posz = posq[pmeAtomGridIndex[i].x].z;
posz -= floor(posz*recipBoxVecZ.z)*periodicBoxSize.z;
int z = ((int) ((posz*recipBoxVecZ.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
pmeAtomGridIndex[i].y = z;
}
}
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
GLOBAL const int2* RESTRICT pmeAtomGridIndex, GLOBAL const int* RESTRICT pmeAtomRange,
GLOBAL const real4* RESTRICT pmeBsplineTheta
#ifdef CHARGE_FROM_SIGEPS
, GLOBAL const float2* RESTRICT sigmaEpsilon
#else
, GLOBAL const real* RESTRICT charges
#endif
) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = GLOBAL_ID; gridIndex < numGridPoints; gridIndex += GLOBAL_SIZE) {
// Compute the charge on a grid point.
int4 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real result = 0.0f;
// Loop over all atoms that affect this grid point.
for (int ix = 0; ix < PME_ORDER; ++ix) {
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
for (int iy = 0; iy < PME_ORDER; ++iy) {
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1];
int lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
if (z1 > gridPoint.z)
{
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
firstAtom = pmeAtomRange[gridIndex1];
lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
}
}
}
pmeGrid[gridIndex] = result*EPSILON_FACTOR;
}
}
#endif
KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2011-2022 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -133,12 +133,6 @@ public:
* Initialize this object in preparation for a simulation.
*/
void initialize(const System& system);
/**
* Get the number of force buffers required for bonded forces.
*/
int getNumForceBuffers() {
return numForceBuffers;
}
/**
* Compute the bonded interactions.
*
......@@ -148,19 +142,17 @@ public:
private:
std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
OpenCLContext& context;
std::vector<cl::Kernel> kernels;
cl::Kernel kernel;
std::vector<std::vector<std::vector<int> > > forceAtoms;
std::vector<int> indexWidth;
std::vector<std::string> forceSource;
std::vector<int> forceGroup;
std::vector<std::vector<int> > forceSets;
std::vector<cl::Memory*> arguments;
std::vector<std::string> argTypes;
std::vector<OpenCLArray> atomIndices;
std::vector<OpenCLArray> bufferIndices;
std::vector<std::string> prefixCode;
std::vector<std::string> energyParameterDerivatives;
int numForceBuffers, maxBonds, allGroups;
int maxBonds, allGroups;
bool hasInitializedKernels;
};
......
......@@ -125,7 +125,7 @@ public:
* Get the number of force buffers required for nonbonded forces.
*/
int getNumForceBuffers() const {
return numForceBuffers;
return 1;
}
/**
* Get the number of energy buffers required for nonbonded forces.
......@@ -331,7 +331,7 @@ private:
std::map<int, std::string> groupKernelSource;
double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
unsigned int tilesAfterReorder;
long long numTiles;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2011-2022 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -34,7 +34,7 @@
using namespace OpenMM;
using namespace std;
OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
}
void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
......@@ -85,11 +85,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
if (numForces == 0)
return;
// Build the lists of atom indices and buffer indices.
// Build the lists of atom indices.
vector<vector<cl_uint> > bufferVec(numForces);
vector<vector<int> > bufferCounter(numForces, vector<int>(system.getNumParticles(), 0));
vector<int> numBuffers(numForces, 0);
atomIndices.resize(numForces);
for (int i = 0; i < numForces; i++) {
int numBonds = forceAtoms[i].size();
......@@ -102,100 +99,17 @@ void OpenCLBondedUtilities::initialize(const System& system) {
}
atomIndices[i].initialize<cl_uint>(context, indexVec.size(), "bondedIndices");
atomIndices[i].upload(indexVec);
bufferVec[i].resize(width*numBonds, 0);
for (int bond = 0; bond < numBonds; bond++) {
for (int atom = 0; atom < numAtoms; atom++)
bufferVec[i][bond*width+atom] = bufferCounter[i][forceAtoms[i][bond][atom]]++;
}
for (int j = 0; j < (int) bufferCounter[i].size(); j++)
numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
}
// For efficiency, we want to merge multiple forces into a single kernel - but only if that
// won't increase the number of force buffers.
if (context.getSupports64BitGlobalAtomics()) {
// Put all the forces in the same set.
numForceBuffers = 1;
forceSets.push_back(vector<int>());
for (int i = 0; i < numForces; i++)
forceSets[0].push_back(i);
}
else {
// Figure out how many force buffers will be required.
for (int i = 0; i < numForces; i++)
numForceBuffers = max(numForceBuffers, numBuffers[i]);
int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
if (context.getNonbondedUtilities().getHasInteractions())
bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
// Figure out sets of forces that can be merged.
vector<int> unmerged(numForces);
for (int i = 0; i < numForces; i++)
unmerged[i] = i;
for (int i = 0; i < numForces; i++)
for (int j = i-1; j >= 0; j--) {
if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
break;
int temp = unmerged[j+1];
unmerged[j+1] = unmerged[j];
unmerged[j] = temp;
}
while (unmerged.size() > 0) {
int sum = numBuffers[unmerged.back()];
int i;
for (i = 0; i < (int) unmerged.size()-1; i++) {
if (sum+numBuffers[unmerged[i]] > bufferLimit)
break;
sum += numBuffers[unmerged[i]];
}
forceSets.push_back(vector<int>());
for (int j = 0; j < i; j++)
forceSets.back().push_back(unmerged[j]);
forceSets.back().push_back(unmerged.back());
for (int j = 0; j < i; j++)
unmerged.erase(unmerged.begin());
unmerged.pop_back();
}
}
// Update the buffer indices based on merged sets.
bufferIndices.resize(numForces);
for (int i = 0; i < (int) forceSets.size(); i++)
for (int j = 0; j < (int) forceSets[i].size(); j++) {
int force = forceSets[i][j];
int numBonds = forceAtoms[force].size();
int numAtoms = forceAtoms[force][0].size();
int width = indexWidth[force];
for (int k = 0; k < j; k++)
for (int bond = 0; bond < numBonds; bond++)
for (int atom = 0; atom < numAtoms; atom++)
bufferVec[force][bond*width+atom] += bufferCounter[forceSets[i][k]][forceAtoms[force][bond][atom]];
bufferIndices[force].initialize<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
bufferIndices[force].upload(bufferVec[force]);
}
// Create the kernels.
// Create the kernel.
for (auto& set : forceSets) {
int setSize = set.size();
stringstream s;
s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
s<<"#endif\n";
for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i];
string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int i = 0; i < setSize; i++) {
int force = set[i];
s<<"__kernel void computeBondedForces(__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int force = 0; force < numForces; force++) {
string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
s<<", __global const "<<indexType<<"* restrict atomIndices"<<i;
s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
s<<", __global const "<<indexType<<"* restrict atomIndices"<<force;
}
for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
......@@ -205,10 +119,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
s<<"mixed energy = 0;\n";
for (int i = 0; i < energyParameterDerivatives.size(); i++)
s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
for (int i = 0; i < setSize; i++) {
int force = set[i];
s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
}
for (int force = 0; force < numForces; force++)
s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<"energyBuffer[get_global_id(0)] += energy;\n";
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size();
......@@ -220,8 +132,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
cl::Program program = context.createProgram(s.str(), defines);
kernels.push_back(cl::Kernel(program, "computeBondedForces"));
}
kernel = cl::Kernel(program, "computeBondedForces");
forceAtoms.clear();
forceSource.clear();
}
......@@ -247,7 +158,6 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
s<<" "<<indexType<<" atoms = atomIndices"<<forceIndex<<"[index];\n";
s<<" "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
for (int i = 0; i < numAtoms; i++) {
s<<" unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
s<<" real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
......@@ -255,17 +165,9 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) {
s<<" {\n";
if (context.getSupports64BitGlobalAtomics()) {
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"], realToFixedPoint(force"<<(i+1)<<".x));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".y));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".z));\n";
}
else {
s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
s<<" real4 force = forceBuffers[offset];\n";
s<<" force.xyz += force"<<(i+1)<<".xyz;\n";
s<<" forceBuffers[offset] = force;\n";
}
s<<" ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".x));\n";
s<<" ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".y));\n";
s<<" ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".z));\n";
s<<" }\n";
}
s<<"}\n";
......@@ -277,28 +179,18 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
return;
if (!hasInitializedKernels) {
hasInitializedKernels = true;
for (int i = 0; i < (int) forceSets.size(); i++) {
int index = 0;
cl::Kernel& kernel = kernels[i];
if (context.getSupports64BitGlobalAtomics())
kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
else
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
index += 6;
for (int j = 0; j < (int) forceSets[i].size(); j++) {
kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]].getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]].getDeviceBuffer());
}
for (int j = 0; j < (int) atomIndices.size(); j++)
kernel.setArg<cl::Buffer>(index++, atomIndices[j].getDeviceBuffer());
for (int j = 0; j < (int) arguments.size(); j++)
kernel.setArg<cl::Memory>(index++, *arguments[j]);
if (energyParameterDerivatives.size() > 0)
kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
}
}
for (int i = 0; i < (int) kernels.size(); i++) {
cl::Kernel& kernel = kernels[i];
kernel.setArg<cl_int>(3, groups);
if (context.getUseDoublePrecision()) {
kernel.setArg<mm_double4>(4, context.getPeriodicBoxSizeDouble());
......@@ -314,6 +206,5 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
kernel.setArg<mm_float4>(7, context.getPeriodicBoxVecY());
kernel.setArg<mm_float4>(8, context.getPeriodicBoxVecZ());
}
context.executeKernel(kernels[i], maxBonds);
}
context.executeKernel(kernel, maxBonds);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment