Unverified Commit ae686364 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Improved support for devices without 64 bit atomics (#3737)

parent 48664a1f
...@@ -786,8 +786,6 @@ private: ...@@ -786,8 +786,6 @@ private:
ComputeArray globals; ComputeArray globals;
ComputeArray donors; ComputeArray donors;
ComputeArray acceptors; ComputeArray acceptors;
ComputeArray donorBufferIndices;
ComputeArray acceptorBufferIndices;
ComputeArray donorExclusions; ComputeArray donorExclusions;
ComputeArray acceptorExclusions; ComputeArray acceptorExclusions;
std::vector<std::string> globalParamNames; std::vector<std::string> globalParamNames;
......
...@@ -1483,8 +1483,6 @@ void CommonCalcCustomCentroidBondForceKernel::initialize(const System& system, c ...@@ -1483,8 +1483,6 @@ void CommonCalcCustomCentroidBondForceKernel::initialize(const System& system, c
numBonds = force.getNumBonds(); numBonds = force.getNumBonds();
if (numBonds == 0) if (numBonds == 0)
return; return;
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("CustomCentroidBondForce requires a device that supports 64 bit atomic operations");
info = new ForceInfo(force); info = new ForceInfo(force);
cc.addForce(info); cc.addForce(info);
...@@ -2380,8 +2378,7 @@ double CommonCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool ...@@ -2380,8 +2378,7 @@ double CommonCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (interactionGroupData.isInitialized()) { if (interactionGroupData.isInitialized()) {
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
bool useLong = cc.getSupports64BitGlobalAtomics(); interactionGroupKernel->addArg(cc.getLongForceBuffer());
interactionGroupKernel->addArg((useLong ? cc.getLongForceBuffer() : cc.getForceBuffers()));
interactionGroupKernel->addArg(cc.getEnergyBuffer()); interactionGroupKernel->addArg(cc.getEnergyBuffer());
interactionGroupKernel->addArg(cc.getPosq()); interactionGroupKernel->addArg(cc.getPosq());
interactionGroupKernel->addArg((useNeighborList ? filteredGroupData : interactionGroupData)); interactionGroupKernel->addArg((useNeighborList ? filteredGroupData : interactionGroupData));
...@@ -2501,15 +2498,8 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB ...@@ -2501,15 +2498,8 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
charges.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "gbsaObcCharges"); charges.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "gbsaObcCharges");
bornRadii.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "bornRadii"); bornRadii.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "bornRadii");
obcChain.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "obcChain"); obcChain.initialize(cc, cc.getPaddedNumAtoms(), elementSize, "obcChain");
if (cc.getSupports64BitGlobalAtomics()) {
bornSum.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornSum"); bornSum.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornSum");
bornForce.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornForce"); bornForce.initialize<long long>(cc, cc.getPaddedNumAtoms(), "bornForce");
}
else {
int bufferSize = cc.getPaddedNumAtoms()*nb.getNumForceBuffers();
bornSum.initialize(cc, bufferSize, elementSize, "bornSum");
bornForce.initialize(cc, bufferSize, elementSize, "bornForce");
}
cc.addAutoclearBuffer(bornSum); cc.addAutoclearBuffer(bornSum);
cc.addAutoclearBuffer(bornForce); cc.addAutoclearBuffer(bornForce);
vector<double> chargeVec(cc.getPaddedNumAtoms()); vector<double> chargeVec(cc.getPaddedNumAtoms());
...@@ -2541,7 +2531,7 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB ...@@ -2541,7 +2531,7 @@ void CommonCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup()); nb.addInteraction(useCutoff, usePeriodic, false, cutoff, vector<vector<int> >(), source, force.getForceGroup());
nb.addParameter(ComputeParameterInfo(charges, prefix+"charge", "float", 1)); nb.addParameter(ComputeParameterInfo(charges, prefix+"charge", "float", 1));
nb.addParameter(ComputeParameterInfo(params, prefix+"obcParams", "float", 2)); nb.addParameter(ComputeParameterInfo(params, prefix+"obcParams", "float", 2));
nb.addParameter(ComputeParameterInfo(bornForce, prefix+"bornForce", cc.getSupports64BitGlobalAtomics() ? "mm_long" : "real", 1)); nb.addParameter(ComputeParameterInfo(bornForce, prefix+"bornForce", "mm_long", 1));
info = new ForceInfo(force); info = new ForceInfo(force);
cc.addForce(info); cc.addForce(info);
} }
...@@ -2580,7 +2570,6 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2580,7 +2570,6 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
else else
file = CommonKernelSources::gbsaObc; file = CommonKernelSources::gbsaObc;
ComputeProgram program = cc.compileProgram(file, defines); ComputeProgram program = cc.compileProgram(file, defines);
bool useLong = cc.getSupports64BitGlobalAtomics();
computeBornSumKernel = program->createKernel("computeBornSum"); computeBornSumKernel = program->createKernel("computeBornSum");
computeBornSumKernel->addArg(bornSum); computeBornSumKernel->addArg(bornSum);
computeBornSumKernel->addArg(cc.getPosq()); computeBornSumKernel->addArg(cc.getPosq());
...@@ -2600,7 +2589,7 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2600,7 +2589,7 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel->addArg(numAtomBlocks*(numAtomBlocks+1)/2); computeBornSumKernel->addArg(numAtomBlocks*(numAtomBlocks+1)/2);
computeBornSumKernel->addArg(nb.getExclusionTiles()); computeBornSumKernel->addArg(nb.getExclusionTiles());
force1Kernel = program->createKernel("computeGBSAForce1"); force1Kernel = program->createKernel("computeGBSAForce1");
force1Kernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers()); force1Kernel->addArg(cc.getLongForceBuffer());
force1Kernel->addArg(bornForce); force1Kernel->addArg(bornForce);
force1Kernel->addArg(cc.getEnergyBuffer()); force1Kernel->addArg(cc.getEnergyBuffer());
force1Kernel->addArg(cc.getPosq()); force1Kernel->addArg(cc.getPosq());
...@@ -2626,19 +2615,11 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2626,19 +2615,11 @@ double CommonCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornSumKernel->addArg(0.8f); reduceBornSumKernel->addArg(0.8f);
reduceBornSumKernel->addArg(4.85f); reduceBornSumKernel->addArg(4.85f);
reduceBornSumKernel->addArg(bornSum); reduceBornSumKernel->addArg(bornSum);
if (!useLong) {
reduceBornSumKernel->addArg(cc.getPaddedNumAtoms());
reduceBornSumKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
reduceBornSumKernel->addArg(params); reduceBornSumKernel->addArg(params);
reduceBornSumKernel->addArg(bornRadii); reduceBornSumKernel->addArg(bornRadii);
reduceBornSumKernel->addArg(obcChain); reduceBornSumKernel->addArg(obcChain);
reduceBornForceKernel = program->createKernel("reduceBornForce"); reduceBornForceKernel = program->createKernel("reduceBornForce");
reduceBornForceKernel->addArg(bornForce); reduceBornForceKernel->addArg(bornForce);
if (!useLong) {
reduceBornForceKernel->addArg(cc.getPaddedNumAtoms());
reduceBornForceKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
reduceBornForceKernel->addArg(cc.getEnergyBuffer()); reduceBornForceKernel->addArg(cc.getEnergyBuffer());
reduceBornForceKernel->addArg(params); reduceBornForceKernel->addArg(params);
reduceBornForceKernel->addArg(bornRadii); reduceBornForceKernel->addArg(bornRadii);
...@@ -2872,28 +2853,17 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2872,28 +2853,17 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
energyParamDerivExpressions[i].push_back(ex.differentiate(force.getEnergyParameterDerivativeName(j)).optimize()); energyParamDerivExpressions[i].push_back(ex.differentiate(force.getEnergyParameterDerivativeName(j)).optimize());
} }
bool deviceIsCpu = cc.getIsCPU(); bool deviceIsCpu = cc.getIsCPU();
bool useLong = cc.getSupports64BitGlobalAtomics();
int elementSize = (cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cc.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
if (useLong) {
valueBuffers.initialize<long long>(cc, cc.getPaddedNumAtoms(), "customGBValueBuffers"); valueBuffers.initialize<long long>(cc, cc.getPaddedNumAtoms(), "customGBValueBuffers");
longEnergyDerivs.initialize<long long>(cc, numComputedValues*cc.getPaddedNumAtoms(), "customGBLongEnergyDerivatives"); longEnergyDerivs.initialize<long long>(cc, numComputedValues*cc.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
energyDerivs = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivatives", true); energyDerivs = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
}
else {
int bufferSize = cc.getPaddedNumAtoms()*nb.getNumForceBuffers();
valueBuffers.initialize(cc, bufferSize, elementSize, "customGBValueBuffers");
energyDerivs = new ComputeParameterSet(cc, numComputedValues, bufferSize, "customGBEnergyDerivatives", true);
}
cc.addAutoclearBuffer(valueBuffers); cc.addAutoclearBuffer(valueBuffers);
energyDerivChain = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivativeChain", true); energyDerivChain = new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "customGBEnergyDerivativeChain", true);
needEnergyParamDerivs = (force.getNumEnergyParameterDerivatives() > 0); needEnergyParamDerivs = (force.getNumEnergyParameterDerivatives() > 0);
dValue0dParam.resize(force.getNumEnergyParameterDerivatives()); dValue0dParam.resize(force.getNumEnergyParameterDerivatives());
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) { for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
dValuedParam.push_back(new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "dValuedParam", true, cc.getUseDoublePrecision())); dValuedParam.push_back(new ComputeParameterSet(cc, numComputedValues, cc.getPaddedNumAtoms(), "dValuedParam", true, cc.getUseDoublePrecision()));
if (useLong)
dValue0dParam[i].initialize<long long>(cc, cc.getPaddedNumAtoms(), "dValue0dParam"); dValue0dParam[i].initialize<long long>(cc, cc.getPaddedNumAtoms(), "dValue0dParam");
else
dValue0dParam[i].initialize(cc, cc.getPaddedNumAtoms()*nb.getNumForceBuffers(), elementSize, "dValue0dParam");
cc.addAutoclearBuffer(dValue0dParam[i]); cc.addAutoclearBuffer(dValue0dParam[i]);
string name = force.getEnergyParameterDerivativeName(i); string name = force.getEnergyParameterDerivativeName(i);
cc.addEnergyParameterDerivative(name); cc.addEnergyParameterDerivative(name);
...@@ -2960,10 +2930,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2960,10 +2930,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
} }
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) { for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
string derivName = "dValue0dParam"+cc.intToString(i+1); string derivName = "dValue0dParam"+cc.intToString(i+1);
if (useLong)
extraArgs << ", GLOBAL mm_ulong* RESTRICT global_" << derivName; extraArgs << ", GLOBAL mm_ulong* RESTRICT global_" << derivName;
else
extraArgs << ", GLOBAL real* RESTRICT global_" << derivName;
atomParams << "LOCAL real local_" << derivName << "[LOCAL_BUFFER_SIZE];\n"; atomParams << "LOCAL real local_" << derivName << "[LOCAL_BUFFER_SIZE];\n";
loadLocal2 << "local_" << derivName << "[localAtomIndex] = 0;\n"; loadLocal2 << "local_" << derivName << "[localAtomIndex] = 0;\n";
load1 << "real " << derivName << " = 0;\n"; load1 << "real " << derivName << " = 0;\n";
...@@ -2975,21 +2942,12 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2975,21 +2942,12 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
tempDerivs2 << "local_" << derivName << "[j] += temp_" << derivName << "_2;\n"; tempDerivs2 << "local_" << derivName << "[j] += temp_" << derivName << "_2;\n";
else else
tempDerivs2 << "local_" << derivName << "[tbx+tj] += temp_" << derivName << "_2;\n"; tempDerivs2 << "local_" << derivName << "[tbx+tj] += temp_" << derivName << "_2;\n";
if (useLong) {
storeDeriv1 << "ATOMIC_ADD(&global_" << derivName << "[offset1], (mm_ulong) realToFixedPoint(" << derivName << "));\n"; storeDeriv1 << "ATOMIC_ADD(&global_" << derivName << "[offset1], (mm_ulong) realToFixedPoint(" << derivName << "));\n";
if (deviceIsCpu) if (deviceIsCpu)
storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[tgx]));\n"; storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[tgx]));\n";
else else
storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[LOCAL_ID]));\n"; storeDeriv2 << "ATOMIC_ADD(&global_" << derivName << "[offset2], (mm_ulong) realToFixedPoint(local_" << derivName << "[LOCAL_ID]));\n";
} }
else {
storeDeriv1 << "global_" << derivName << "[offset1] += " << derivName << ";\n";
if (deviceIsCpu)
storeDeriv2 << "global_" << derivName << "[offset2] += local_" << derivName << "[tgx];\n";
else
storeDeriv2 << "global_" << derivName << "[offset2] += local_" << derivName << "[LOCAL_ID];\n";
}
}
} }
replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str(); replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
replacements["ATOM_PARAMETER_DATA"] = atomParams.str(); replacements["ATOM_PARAMETER_DATA"] = atomParams.str();
...@@ -3039,16 +2997,8 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3039,16 +2997,8 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
} }
for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) { for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
string variableName = "dValuedParam_0_"+cc.intToString(i); string variableName = "dValuedParam_0_"+cc.intToString(i);
if (useLong) {
extraArgs << ", GLOBAL const mm_long* RESTRICT dValue0dParam" << i; extraArgs << ", GLOBAL const mm_long* RESTRICT dValue0dParam" << i;
deriv0 << "real " << variableName << " = RECIP((real) 0x100000000)*dValue0dParam" << i << "[index];\n"; deriv0 << "real " << variableName << " = RECIP((real) 0x100000000)*dValue0dParam" << i << "[index];\n";
}
else {
extraArgs << ", GLOBAL const real* RESTRICT dValue0dParam" << i;
deriv0 << "real " << variableName << " = dValue0dParam" << i << "[index];\n";
deriv0 << "for (int i = index+bufferSize; i < totalSize; i += bufferSize)\n";
deriv0 << " " << variableName << " += dValue0dParam" << i << "[i];\n";
}
for (int j = 0; j < dValuedParam[i]->getParameterInfos().size(); j++) for (int j = 0; j < dValuedParam[i]->getParameterInfos().size(); j++)
extraArgs << ", GLOBAL real* RESTRICT global_dValuedParam_" << j << "_" << i; extraArgs << ", GLOBAL real* RESTRICT global_dValuedParam_" << j << "_" << i;
deriv0 << "global_dValuedParam_0_" << i << "[index] = dValuedParam_0_" << i << ";\n"; deriv0 << "global_dValuedParam_0_" << i << "[index] = dValuedParam_0_" << i << ";\n";
...@@ -3129,7 +3079,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3129,7 +3079,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
map<string, Lepton::ParsedExpression> n2EnergyExpressions; map<string, Lepton::ParsedExpression> n2EnergyExpressions;
n2EnergyExpressions["tempEnergy += "] = Lepton::Parser::parse(expression, functions).optimize(); n2EnergyExpressions["tempEnergy += "] = Lepton::Parser::parse(expression, functions).optimize();
n2EnergyExpressions["dEdR += "] = Lepton::Parser::parse(expression, functions).differentiate("r").optimize(); n2EnergyExpressions["dEdR += "] = Lepton::Parser::parse(expression, functions).differentiate("r").optimize();
if (useLong) {
for (int j = 0; j < numComputedValues; j++) { for (int j = 0; j < numComputedValues; j++) {
if (needChainForValue[j]) { if (needChainForValue[j]) {
string index = cc.intToString(j+1); string index = cc.intToString(j+1);
...@@ -3137,15 +3086,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3137,15 +3086,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1]; n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+index+"_2 += "] = energyDerivExpressions[i][2*j+1];
} }
} }
}
else {
for (int j = 0; j < numComputedValues; j++) {
if (needChainForValue[j]) {
n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_1")+" += "] = energyDerivExpressions[i][2*j];
n2EnergyExpressions["/*"+cc.intToString(i+1)+"*/ deriv"+energyDerivs->getParameterSuffix(j, "_2")+" += "] = energyDerivExpressions[i][2*j+1];
}
}
}
for (int j = 0; j < force.getNumEnergyParameterDerivatives(); j++) for (int j = 0; j < force.getNumEnergyParameterDerivatives(); j++)
n2EnergyExpressions["energyParamDeriv"+cc.intToString(j)+" += interactionScale*"] = energyParamDerivExpressions[i][j]; n2EnergyExpressions["energyParamDeriv"+cc.intToString(j)+" += interactionScale*"] = energyParamDerivExpressions[i][j];
if (exclude) if (exclude)
...@@ -3188,7 +3128,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3188,7 +3128,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
pairEnergyUsesValue[i] = true; pairEnergyUsesValue[i] = true;
} }
} }
if (useLong) {
extraArgs << ", GLOBAL mm_ulong* RESTRICT derivBuffers"; extraArgs << ", GLOBAL mm_ulong* RESTRICT derivBuffers";
for (int i = 0; i < numComputedValues; i++) { for (int i = 0; i < numComputedValues; i++) {
string index = cc.intToString(i+1); string index = cc.intToString(i+1);
...@@ -3200,21 +3139,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3200,21 +3139,6 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n"; storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n"; storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
} }
}
else {
for (int i = 0; i < (int) energyDerivs->getParameterInfos().size(); i++) {
const ComputeParameterInfo& buffer = energyDerivs->getParameterInfos()[i];
string index = cc.intToString(i+1);
extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivBuffers" << index;
atomParams << "LOCAL " << buffer.getType() << " local_deriv" << index << "[LOCAL_BUFFER_SIZE];\n";
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
}
}
if (needEnergyParamDerivs) { if (needEnergyParamDerivs) {
extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs"; extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs";
const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames(); const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames();
...@@ -3285,16 +3209,10 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3285,16 +3209,10 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = cc.intToString(i+1); string index = cc.intToString(i+1);
extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivChain" << index; extraArgs << ", GLOBAL " << buffer.getType() << "* RESTRICT derivChain" << index;
} }
if (useLong) {
extraArgs << ", GLOBAL const mm_long* RESTRICT derivBuffersIn"; extraArgs << ", GLOBAL const mm_long* RESTRICT derivBuffersIn";
for (int i = 0; i < energyDerivs->getNumParameters(); ++i) for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") << reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
" = RECIP((real) 0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cc.intToString(i) << "];\n"; " = RECIP((real) 0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cc.intToString(i) << "];\n";
}
else {
for (int i = 0; i < (int) energyDerivs->getParameterInfos().size(); i++)
reduce << "REDUCE_VALUE(derivBuffers" << cc.intToString(i+1) << ", " << energyDerivs->getParameterInfos()[i].getType() << ")\n";
}
if (needEnergyParamDerivs) { if (needEnergyParamDerivs) {
extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs"; extraArgs << ", GLOBAL mixed* RESTRICT energyParamDerivs";
const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames(); const vector<string>& allParamDerivNames = cc.getEnergyParamDerivNames();
...@@ -3353,13 +3271,9 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3353,13 +3271,9 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = cc.intToString(i+1); string index = cc.intToString(i+1);
compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n"; compute << "derivBuffers" << index << "[index] = deriv" << index << ";\n";
} }
if (useLong) {
compute << "forceBuffers[index] += realToFixedPoint(force.x);\n"; compute << "forceBuffers[index] += realToFixedPoint(force.x);\n";
compute << "forceBuffers[index+PADDED_NUM_ATOMS] += realToFixedPoint(force.y);\n"; compute << "forceBuffers[index+PADDED_NUM_ATOMS] += realToFixedPoint(force.y);\n";
compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += realToFixedPoint(force.z);\n"; compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += realToFixedPoint(force.z);\n";
}
else
compute << "forceBuffers[index] = forceBuffers[index]+make_real4(force.x, force.y, force.z, 0);\n";
for (int i = 1; i < numComputedValues; i++) { for (int i = 1; i < numComputedValues; i++) {
compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0"; compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
for (int j = 1; j < i; j++) for (int j = 1; j < i; j++)
...@@ -3548,12 +3462,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -3548,12 +3462,7 @@ void CommonCalcCustomGBForceKernel::initialize(const System& system, const Custo
} }
info = new ForceInfo(force); info = new ForceInfo(force);
cc.addForce(info); cc.addForce(info);
if (useLong)
cc.addAutoclearBuffer(longEnergyDerivs); cc.addAutoclearBuffer(longEnergyDerivs);
else {
for (auto& buffer : energyDerivs->getParameterInfos())
cc.addAutoclearBuffer(buffer.getArray());
}
} }
double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) { double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
...@@ -3600,7 +3509,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3600,7 +3509,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0); maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
int numAtomBlocks = cc.getPaddedNumAtoms()/32; int numAtomBlocks = cc.getPaddedNumAtoms()/32;
bool useLong = cc.getSupports64BitGlobalAtomics();
pairValueKernel->addArg(cc.getPosq()); pairValueKernel->addArg(cc.getPosq());
pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusions()); pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusions());
pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusionTiles()); pairValueKernel->addArg(cc.getNonbondedUtilities().getExclusionTiles());
...@@ -3631,10 +3539,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3631,10 +3539,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel->addArg(function); pairValueKernel->addArg(function);
perParticleValueKernel->addArg(cc.getPosq()); perParticleValueKernel->addArg(cc.getPosq());
perParticleValueKernel->addArg(valueBuffers); perParticleValueKernel->addArg(valueBuffers);
if (!useLong) {
perParticleValueKernel->addArg(cc.getPaddedNumAtoms());
perParticleValueKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
if (globals.isInitialized()) if (globals.isInitialized())
perParticleValueKernel->addArg(globals); perParticleValueKernel->addArg(globals);
for (auto& buffer : params->getParameterInfos()) for (auto& buffer : params->getParameterInfos())
...@@ -3648,7 +3552,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3648,7 +3552,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
} }
for (auto& function : tabulatedFunctionArrays) for (auto& function : tabulatedFunctionArrays)
perParticleValueKernel->addArg(function); perParticleValueKernel->addArg(function);
pairEnergyKernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers()); pairEnergyKernel->addArg(cc.getLongForceBuffer());
pairEnergyKernel->addArg(cc.getEnergyBuffer()); pairEnergyKernel->addArg(cc.getEnergyBuffer());
pairEnergyKernel->addArg(cc.getPosq()); pairEnergyKernel->addArg(cc.getPosq());
pairEnergyKernel->addArg(cc.getNonbondedUtilities().getExclusions()); pairEnergyKernel->addArg(cc.getNonbondedUtilities().getExclusions());
...@@ -3680,24 +3584,14 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3680,24 +3584,14 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel->addArg(buffer.getArray()); pairEnergyKernel->addArg(buffer.getArray());
} }
} }
if (useLong)
pairEnergyKernel->addArg(longEnergyDerivs); pairEnergyKernel->addArg(longEnergyDerivs);
else
for (auto& buffer : energyDerivs->getParameterInfos())
pairEnergyKernel->addArg(buffer.getArray());
if (needEnergyParamDerivs) if (needEnergyParamDerivs)
pairEnergyKernel->addArg(cc.getEnergyParamDerivBuffer()); pairEnergyKernel->addArg(cc.getEnergyParamDerivBuffer());
for (auto& function : tabulatedFunctionArrays) for (auto& function : tabulatedFunctionArrays)
pairEnergyKernel->addArg(function); pairEnergyKernel->addArg(function);
perParticleEnergyKernel->addArg(cc.getEnergyBuffer()); perParticleEnergyKernel->addArg(cc.getEnergyBuffer());
perParticleEnergyKernel->addArg(cc.getPosq()); perParticleEnergyKernel->addArg(cc.getPosq());
if (cc.getSupports64BitGlobalAtomics())
perParticleEnergyKernel->addArg(cc.getLongForceBuffer()); perParticleEnergyKernel->addArg(cc.getLongForceBuffer());
else {
perParticleEnergyKernel->addArg(cc.getForceBuffers());
perParticleEnergyKernel->addArg(cc.getPaddedNumAtoms());
perParticleEnergyKernel->addArg(cc.getForceBuffers().getSize()/cc.getPaddedNumAtoms());
}
if (globals.isInitialized()) if (globals.isInitialized())
perParticleEnergyKernel->addArg(globals); perParticleEnergyKernel->addArg(globals);
for (auto& buffer : params->getParameterInfos()) for (auto& buffer : params->getParameterInfos())
...@@ -3708,7 +3602,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3708,7 +3602,6 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
perParticleEnergyKernel->addArg(buffer.getArray()); perParticleEnergyKernel->addArg(buffer.getArray());
for (auto& buffer : energyDerivChain->getParameterInfos()) for (auto& buffer : energyDerivChain->getParameterInfos())
perParticleEnergyKernel->addArg(buffer.getArray()); perParticleEnergyKernel->addArg(buffer.getArray());
if (useLong)
perParticleEnergyKernel->addArg(longEnergyDerivs); perParticleEnergyKernel->addArg(longEnergyDerivs);
if (needEnergyParamDerivs) if (needEnergyParamDerivs)
perParticleEnergyKernel->addArg(cc.getEnergyParamDerivBuffer()); perParticleEnergyKernel->addArg(cc.getEnergyParamDerivBuffer());
...@@ -3716,7 +3609,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3716,7 +3609,7 @@ double CommonCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
perParticleEnergyKernel->addArg(function); perParticleEnergyKernel->addArg(function);
if (needParameterGradient || needEnergyParamDerivs) { if (needParameterGradient || needEnergyParamDerivs) {
gradientChainRuleKernel->addArg(cc.getPosq()); gradientChainRuleKernel->addArg(cc.getPosq());
gradientChainRuleKernel->addArg(useLong ? cc.getLongForceBuffer() : cc.getForceBuffers()); gradientChainRuleKernel->addArg(cc.getLongForceBuffer());
if (globals.isInitialized()) if (globals.isInitialized())
gradientChainRuleKernel->addArg(globals); gradientChainRuleKernel->addArg(globals);
for (auto& buffer : params->getParameterInfos()) for (auto& buffer : params->getParameterInfos())
...@@ -3937,33 +3830,6 @@ void CommonCalcCustomHbondForceKernel::initialize(const System& system, const Cu ...@@ -3937,33 +3830,6 @@ void CommonCalcCustomHbondForceKernel::initialize(const System& system, const Cu
} }
acceptors.upload(acceptorVector); acceptors.upload(acceptorVector);
acceptorParams->setParameterValues(acceptorParamVector); acceptorParams->setParameterValues(acceptorParamVector);
// Select an output buffer index for each donor and acceptor.
if (!cc.getSupports64BitGlobalAtomics()) {
donorBufferIndices.initialize<mm_int4>(cc, numDonors, "customHbondDonorBuffers");
acceptorBufferIndices.initialize<mm_int4>(cc, numAcceptors, "customHbondAcceptorBuffers");
vector<mm_int4> donorBufferVector(numDonors);
vector<mm_int4> acceptorBufferVector(numAcceptors);
vector<int> donorBufferCounter(numParticles, 0);
for (int i = 0; i < numDonors; i++)
donorBufferVector[i] = mm_int4(donorVector[i].x > -1 ? donorBufferCounter[donorVector[i].x]++ : 0,
donorVector[i].y > -1 ? donorBufferCounter[donorVector[i].y]++ : 0,
donorVector[i].z > -1 ? donorBufferCounter[donorVector[i].z]++ : 0, 0);
vector<int> acceptorBufferCounter(numParticles, 0);
for (int i = 0; i < numAcceptors; i++)
acceptorBufferVector[i] = mm_int4(acceptorVector[i].x > -1 ? acceptorBufferCounter[acceptorVector[i].x]++ : 0,
acceptorVector[i].y > -1 ? acceptorBufferCounter[acceptorVector[i].y]++ : 0,
acceptorVector[i].z > -1 ? acceptorBufferCounter[acceptorVector[i].z]++ : 0, 0);
donorBufferIndices.upload(donorBufferVector);
acceptorBufferIndices.upload(acceptorBufferVector);
int maxBuffers = 1;
for (int i : donorBufferCounter)
maxBuffers = max(maxBuffers, i);
for (int i : acceptorBufferCounter)
maxBuffers = max(maxBuffers, i);
cc.requestForceBuffers(maxBuffers);
}
info = new ForceInfo(force); info = new ForceInfo(force);
cc.addForce(info); cc.addForce(info);
...@@ -4243,12 +4109,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl ...@@ -4243,12 +4109,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
} }
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
if (cc.getSupports64BitGlobalAtomics())
donorKernel->addArg(cc.getLongForceBuffer()); donorKernel->addArg(cc.getLongForceBuffer());
else {
donorKernel->addArg(cc.getForceBuffers());
donorKernel->addArg(donorBufferIndices);
}
donorKernel->addArg(cc.getEnergyBuffer()); donorKernel->addArg(cc.getEnergyBuffer());
donorKernel->addArg(cc.getPosq()); donorKernel->addArg(cc.getPosq());
donorKernel->addArg(donorExclusions); donorKernel->addArg(donorExclusions);
...@@ -4264,12 +4125,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl ...@@ -4264,12 +4125,7 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
donorKernel->addArg(parameter.getArray()); donorKernel->addArg(parameter.getArray());
for (auto& function : tabulatedFunctionArrays) for (auto& function : tabulatedFunctionArrays)
donorKernel->addArg(function); donorKernel->addArg(function);
if (cc.getSupports64BitGlobalAtomics())
acceptorKernel->addArg(cc.getLongForceBuffer()); acceptorKernel->addArg(cc.getLongForceBuffer());
else {
acceptorKernel->addArg(cc.getForceBuffers());
acceptorKernel->addArg(acceptorBufferIndices);
}
acceptorKernel->addArg(cc.getEnergyBuffer()); acceptorKernel->addArg(cc.getEnergyBuffer());
acceptorKernel->addArg(cc.getPosq()); acceptorKernel->addArg(cc.getPosq());
acceptorKernel->addArg(acceptorExclusions); acceptorKernel->addArg(acceptorExclusions);
...@@ -4286,9 +4142,9 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl ...@@ -4286,9 +4142,9 @@ double CommonCalcCustomHbondForceKernel::execute(ContextImpl& context, bool incl
for (auto& function : tabulatedFunctionArrays) for (auto& function : tabulatedFunctionArrays)
acceptorKernel->addArg(function); acceptorKernel->addArg(function);
} }
setPeriodicBoxArgs(cc, donorKernel, cc.getSupports64BitGlobalAtomics() ? 6 : 7); setPeriodicBoxArgs(cc, donorKernel, 6);
donorKernel->execute(max(numDonors, numAcceptors), 64); donorKernel->execute(max(numDonors, numAcceptors), 64);
setPeriodicBoxArgs(cc, acceptorKernel, cc.getSupports64BitGlobalAtomics() ? 6 : 7); setPeriodicBoxArgs(cc, acceptorKernel, 6);
acceptorKernel->execute(max(numDonors, numAcceptors), 64); acceptorKernel->execute(max(numDonors, numAcceptors), 64);
return 0.0; return 0.0;
} }
...@@ -4391,8 +4247,6 @@ CommonCalcCustomManyParticleForceKernel::~CommonCalcCustomManyParticleForceKerne ...@@ -4391,8 +4247,6 @@ CommonCalcCustomManyParticleForceKernel::~CommonCalcCustomManyParticleForceKerne
void CommonCalcCustomManyParticleForceKernel::initialize(const System& system, const CustomManyParticleForce& force) { void CommonCalcCustomManyParticleForceKernel::initialize(const System& system, const CustomManyParticleForce& force) {
ContextSelector selector(cc); ContextSelector selector(cc);
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("CustomManyParticleForce requires a device that supports 64 bit atomic operations");
int numParticles = force.getNumParticles(); int numParticles = force.getNumParticles();
int particlesPerSet = force.getNumParticlesPerSet(); int particlesPerSet = force.getNumParticlesPerSet();
bool centralParticleMode = (force.getPermutationMode() == CustomManyParticleForce::UniqueCentralParticle); bool centralParticleMode = (force.getPermutationMode() == CustomManyParticleForce::UniqueCentralParticle);
...@@ -4933,9 +4787,6 @@ private: ...@@ -4933,9 +4787,6 @@ private:
}; };
void CommonCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) { void CommonCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("GayBerneForce requires a device that supports 64 bit atomic operations");
// Initialize interactions. // Initialize interactions.
ContextSelector selector(cc); ContextSelector selector(cc);
......
...@@ -528,8 +528,6 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System ...@@ -528,8 +528,6 @@ IntegrationUtilities::IntegrationUtilities(ComputeContext& context, const System
for (int i = 0; i < numAtoms; i++) for (int i = 0; i < numAtoms; i++)
if (atomCounts[i] > 1) if (atomCounts[i] > 1)
hasOverlappingVsites = true; hasOverlappingVsites = true;
if (hasOverlappingVsites && !context.getSupports64BitGlobalAtomics())
throw OpenMMException("This device does not support 64 bit atomics. Cannot have multiple virtual sites that depend on the same atom.");
// Create the kernels used by this class. // Create the kernels used by this class.
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1)); #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[LOCAL_ID])); #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[LOCAL_ID]));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
#endif
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
KERNEL void computeN2Energy( KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
GLOBAL real4* RESTRICT forceBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles, int needEnergy, GLOBAL const int2* exclusionTiles, int needEnergy,
...@@ -160,7 +151,6 @@ KERNEL void computeN2Energy( ...@@ -160,7 +151,6 @@ KERNEL void computeN2Energy(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
...@@ -173,18 +163,6 @@ KERNEL void computeN2Energy( ...@@ -173,18 +163,6 @@ KERNEL void computeN2Energy(
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[LOCAL_ID].z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[LOCAL_ID].z));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = offset1;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
if (x != y) {
offset = offset2;
forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
STORE_DERIVATIVES_2
}
#endif
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
...@@ -363,7 +341,6 @@ KERNEL void computeN2Energy( ...@@ -363,7 +341,6 @@ KERNEL void computeN2Energy(
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
...@@ -376,18 +353,6 @@ KERNEL void computeN2Energy( ...@@ -376,18 +353,6 @@ KERNEL void computeN2Energy(
offset = atom2; offset = atom2;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
unsigned int offset = offset1;
STORE_DERIVATIVES_1
if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
offset = offset2;
STORE_DERIVATIVES_2
}
#endif
} }
pos++; pos++;
} }
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1)); #define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(deriv##INDEX##_1));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[tgx])); #define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_deriv##INDEX[tgx]));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
#endif
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
KERNEL void computeN2Energy( KERNEL void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
GLOBAL real4* RESTRICT forceBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles, int needEnergy, GLOBAL const int2* exclusionTiles, int needEnergy,
...@@ -100,17 +91,11 @@ KERNEL void computeN2Energy( ...@@ -100,17 +91,11 @@ KERNEL void computeN2Energy(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
} }
} }
else { else {
...@@ -174,33 +159,21 @@ KERNEL void computeN2Energy( ...@@ -174,33 +159,21 @@ KERNEL void computeN2Energy(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
} }
// Write results. // Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx; unsigned int offset = y*TILE_SIZE+tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(local_force[tgx].x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(local_force[tgx].x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
#else
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
} }
} }
} }
...@@ -316,17 +289,11 @@ KERNEL void computeN2Energy( ...@@ -316,17 +289,11 @@ KERNEL void computeN2Energy(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
} }
} }
else else
...@@ -375,17 +342,11 @@ KERNEL void computeN2Energy( ...@@ -375,17 +342,11 @@ KERNEL void computeN2Energy(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1; unsigned int offset = atom1;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
} }
} }
...@@ -398,17 +359,11 @@ KERNEL void computeN2Energy( ...@@ -398,17 +359,11 @@ KERNEL void computeN2Energy(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(local_force[tgx].x)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(local_force[tgx].x));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].y));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(local_force[tgx].z));
unsigned int offset = atom2; unsigned int offset = atom2;
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
#else
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
} }
} }
} }
......
...@@ -10,20 +10,13 @@ ...@@ -10,20 +10,13 @@
*/ */
KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS INIT_PARAM_DERIVS
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) { for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Reduce the derivatives // Reduce the derivatives
#ifndef SUPPORTS_64_BIT_ATOMICS
int totalSize = bufferSize*numBuffers;
#endif
REDUCE_DERIVATIVES REDUCE_DERIVATIVES
// Now calculate the per-particle energy terms. // Now calculate the per-particle energy terms.
......
...@@ -3,29 +3,17 @@ ...@@ -3,29 +3,17 @@
*/ */
KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq, KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers GLOBAL mm_long* RESTRICT forceBuffers
#else
GLOBAL real4* RESTRICT forceBuffers
#endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
INIT_PARAM_DERIVS INIT_PARAM_DERIVS
const real scale = RECIP((real) 0x100000000); const real scale = RECIP((real) 0x100000000);
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) { for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
real4 pos = posq[index]; real4 pos = posq[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]); real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
#else
real3 force = trimTo3(forceBuffers[index]);
#endif
COMPUTE_FORCES COMPUTE_FORCES
#ifdef SUPPORTS_64_BIT_ATOMICS
forceBuffers[index] = realToFixedPoint(force.x); forceBuffers[index] = realToFixedPoint(force.x);
forceBuffers[index+PADDED_NUM_ATOMS] = realToFixedPoint(force.y); forceBuffers[index+PADDED_NUM_ATOMS] = realToFixedPoint(force.y);
forceBuffers[index+PADDED_NUM_ATOMS*2] = realToFixedPoint(force.z); forceBuffers[index+PADDED_NUM_ATOMS*2] = realToFixedPoint(force.z);
#else
forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
#endif
} }
SAVE_PARAM_DERIVS SAVE_PARAM_DERIVS
} }
...@@ -3,11 +3,7 @@ ...@@ -3,11 +3,7 @@
*/ */
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles, GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value, GLOBAL mm_ulong* RESTRICT global_value,
#else
GLOBAL real* RESTRICT global_value,
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
...@@ -137,7 +133,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -137,7 +133,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = x*TILE_SIZE + tgx; unsigned int offset1 = x*TILE_SIZE + tgx;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
...@@ -146,16 +141,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -146,16 +141,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID])); ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_value[offset1] += value;
STORE_PARAM_DERIVS1
if (x != y) {
global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2
}
#endif
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
...@@ -317,7 +302,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -317,7 +302,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
...@@ -326,16 +310,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -326,16 +310,6 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID])); ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[LOCAL_ID]));
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
global_value[offset1] += value;
STORE_PARAM_DERIVS1
if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[LOCAL_ID];
STORE_PARAM_DERIVS2
}
#endif
} }
pos++; pos++;
} }
......
...@@ -3,11 +3,7 @@ ...@@ -3,11 +3,7 @@
*/ */
KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions, KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
GLOBAL const int2* exclusionTiles, GLOBAL const int2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_value, GLOBAL mm_ulong* RESTRICT global_value,
#else
GLOBAL real* RESTRICT global_value,
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
...@@ -84,13 +80,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -84,13 +80,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
} }
} }
...@@ -146,26 +137,16 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -146,26 +137,16 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
} }
// Write results. // Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = y*TILE_SIZE+tgx; unsigned int offset2 = y*TILE_SIZE+tgx;
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx])); ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
#else
unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx];
#endif
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
} }
...@@ -273,13 +254,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -273,13 +254,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
} }
} }
...@@ -322,13 +298,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -322,13 +298,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset1 = atom1; unsigned int offset1 = atom1;
ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value)); ATOMIC_ADD(&global_value[offset1], (mm_ulong) realToFixedPoint(value));
#else
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
#endif
STORE_PARAM_DERIVS1 STORE_PARAM_DERIVS1
} }
} }
...@@ -342,13 +313,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig ...@@ -342,13 +313,8 @@ KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsig
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset2 = atom2; unsigned int offset2 = atom2;
ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx])); ATOMIC_ADD(&global_value[offset2], (mm_ulong) realToFixedPoint(local_value[tgx]));
#else
unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[tgx];
#endif
STORE_PARAM_DERIVS2 STORE_PARAM_DERIVS2
} }
} }
......
...@@ -3,23 +3,12 @@ ...@@ -3,23 +3,12 @@
*/ */
KERNEL void computePerParticleValues(GLOBAL real4* posq, KERNEL void computePerParticleValues(GLOBAL real4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* valueBuffers GLOBAL mm_long* valueBuffers
#else
GLOBAL real* valueBuffers, int bufferSize, int numBuffers
#endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) { for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Reduce the pairwise value // Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS
real sum = valueBuffers[index]/(real) 0x100000000; real sum = valueBuffers[index]/(real) 0x100000000;
#else
int totalSize = bufferSize*numBuffers;
real sum = valueBuffers[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += valueBuffers[i];
#endif
REDUCE_PARAM0_DERIV REDUCE_PARAM0_DERIV
// Now calculate other values // Now calculate other values
......
...@@ -44,11 +44,7 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) { ...@@ -44,11 +44,7 @@ inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
* Compute forces on donors. * Compute forces on donors.
*/ */
KERNEL void computeDonorForces( KERNEL void computeDonorForces(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force, GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
...@@ -114,7 +110,6 @@ KERNEL void computeDonorForces( ...@@ -114,7 +110,6 @@ KERNEL void computeDonorForces(
// Write results // Write results
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) { if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x)); ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y)); ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
...@@ -133,27 +128,6 @@ KERNEL void computeDonorForces( ...@@ -133,27 +128,6 @@ KERNEL void computeDonorForces(
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z)); ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
MEM_FENCE; MEM_FENCE;
} }
#else
int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
#endif
} }
} }
energyBuffer[GLOBAL_ID] += energy; energyBuffer[GLOBAL_ID] += energy;
...@@ -162,11 +136,7 @@ KERNEL void computeDonorForces( ...@@ -162,11 +136,7 @@ KERNEL void computeDonorForces(
* Compute forces on acceptors. * Compute forces on acceptors.
*/ */
KERNEL void computeAcceptorForces( KERNEL void computeAcceptorForces(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT force, GLOBAL mm_ulong* RESTRICT force,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
...@@ -231,7 +201,6 @@ KERNEL void computeAcceptorForces( ...@@ -231,7 +201,6 @@ KERNEL void computeAcceptorForces(
// Write results // Write results
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
if (atoms.x > -1) { if (atoms.x > -1) {
ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x)); ATOMIC_ADD(&force[atoms.x], (mm_ulong) realToFixedPoint(f1.x));
ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y)); ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f1.y));
...@@ -250,27 +219,6 @@ KERNEL void computeAcceptorForces( ...@@ -250,27 +219,6 @@ KERNEL void computeAcceptorForces(
ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z)); ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(f3.z));
MEM_FENCE; MEM_FENCE;
} }
#else
int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
real4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
#endif
} }
} }
} }
...@@ -35,38 +35,8 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) { ...@@ -35,38 +35,8 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
#endif #endif
} }
#ifndef SUPPORTS_64_BIT_ATOMICS
/**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom.
*/
void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
localData[LOCAL_ID].x = atomIndex;
SYNC_WARPS;
real4 forceSum = make_real4(0);
int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
int end = start+32;
bool isFirst = true;
for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= LOCAL_ID);
}
const unsigned int warp = GLOBAL_ID/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
if (isFirst)
forceBuffers[offset] += forceSum;
SYNC_WARPS;
}
#endif
KERNEL void computeInteractionGroups( KERNEL void computeInteractionGroups(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else
GLOBAL real4* RESTRICT forceBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList, GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
...@@ -139,7 +109,6 @@ KERNEL void computeInteractionGroups( ...@@ -139,7 +109,6 @@ KERNEL void computeInteractionGroups(
} }
SYNC_WARPS; SYNC_WARPS;
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) { if (exclusions != 0) {
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
...@@ -149,13 +118,6 @@ KERNEL void computeInteractionGroups( ...@@ -149,13 +118,6 @@ KERNEL void computeInteractionGroups(
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
SYNC_WARPS; SYNC_WARPS;
#else
writeForces(forceBuffers, localData, atom2);
localData[LOCAL_ID].fx = force.x;
localData[LOCAL_ID].fy = force.y;
localData[LOCAL_ID].fz = force.z;
writeForces(forceBuffers, localData, atom1);
#endif
} }
energyBuffer[GLOBAL_ID] += energy; energyBuffer[GLOBAL_ID] += energy;
SAVE_DERIVATIVES SAVE_DERIVATIVES
......
...@@ -17,11 +17,7 @@ typedef struct ALIGN { ...@@ -17,11 +17,7 @@ typedef struct ALIGN {
* Compute the Born sum. * Compute the Born sum.
*/ */
KERNEL void computeBornSum( KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT global_bornSum, GLOBAL mm_ulong* RESTRICT global_bornSum,
#else
GLOBAL real* RESTRICT global_bornSum,
#endif
GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
...@@ -152,20 +148,12 @@ KERNEL void computeBornSum( ...@@ -152,20 +148,12 @@ KERNEL void computeBornSum(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(bornSum)); ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(bornSum));
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum)); ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
} }
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
if (x != y)
global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
...@@ -357,17 +345,9 @@ KERNEL void computeBornSum( ...@@ -357,17 +345,9 @@ KERNEL void computeBornSum(
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) realToFixedPoint(bornSum)); ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) realToFixedPoint(bornSum));
if (atom2 < PADDED_NUM_ATOMS) if (atom2 < PADDED_NUM_ATOMS)
ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum)); ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].bornSum));
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
if (atom2 < PADDED_NUM_ATOMS)
global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif
} }
pos++; pos++;
} }
...@@ -385,11 +365,7 @@ typedef struct ALIGN { ...@@ -385,11 +365,7 @@ typedef struct ALIGN {
*/ */
KERNEL void computeGBSAForce1( KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
GLOBAL const real* RESTRICT global_bornRadii, int needEnergy, GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -538,7 +514,6 @@ KERNEL void computeGBSAForce1( ...@@ -538,7 +514,6 @@ KERNEL void computeGBSAForce1(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
...@@ -551,16 +526,6 @@ KERNEL void computeGBSAForce1( ...@@ -551,16 +526,6 @@ KERNEL void computeGBSAForce1(
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw)); ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
} }
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w;
if (x != y) {
forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[LOCAL_ID].fw;
}
#endif
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
...@@ -763,7 +728,6 @@ KERNEL void computeGBSAForce1( ...@@ -763,7 +728,6 @@ KERNEL void computeGBSAForce1(
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
...@@ -774,16 +738,6 @@ KERNEL void computeGBSAForce1( ...@@ -774,16 +738,6 @@ KERNEL void computeGBSAForce1(
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fz));
ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw)); ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) realToFixedPoint(localData[LOCAL_ID].fw));
} }
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w;
if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[LOCAL_ID].fw;
}
#endif
} }
pos++; pos++;
} }
......
...@@ -16,13 +16,8 @@ ...@@ -16,13 +16,8 @@
real t2I = (l_ij2I-u_ij2I); real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR; real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR; real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
#ifdef SUPPORTS_64_BIT_ATOMICS
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0); real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0); tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
#else
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else #else
......
...@@ -6,23 +6,12 @@ ...@@ -6,23 +6,12 @@
*/ */
KERNEL void reduceBornSum(float alpha, float beta, float gamma, KERNEL void reduceBornSum(float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL const mm_long* RESTRICT bornSum, GLOBAL const mm_long* RESTRICT bornSum,
#else
GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
#endif
GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) { GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) { for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Get summed Born data // Get summed Born data
#ifdef SUPPORTS_64_BIT_ATOMICS
real sum = RECIP((real) 0x100000000)*bornSum[index]; real sum = RECIP((real) 0x100000000)*bornSum[index];
#else
real sum = bornSum[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
#endif
// Now calculate Born radius and OBC term. // Now calculate Born radius and OBC term.
...@@ -45,24 +34,14 @@ KERNEL void reduceBornSum(float alpha, float beta, float gamma, ...@@ -45,24 +34,14 @@ KERNEL void reduceBornSum(float alpha, float beta, float gamma,
*/ */
KERNEL void reduceBornForce( KERNEL void reduceBornForce(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT bornForce, GLOBAL mm_long* RESTRICT bornForce,
#else
GLOBAL real* bornForce, int bufferSize, int numBuffers,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) { GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
mixed energy = 0; mixed energy = 0;
for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) { for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Get summed Born force // Get summed Born force
#ifdef SUPPORTS_64_BIT_ATOMICS
real force = RECIP((real) 0x100000000)*bornForce[index]; real force = RECIP((real) 0x100000000)*bornForce[index];
#else
real force = bornForce[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
#endif
// Now calculate the actual force // Now calculate the actual force
float offsetRadius = params[index].x; float offsetRadius = params[index].x;
...@@ -73,11 +52,7 @@ KERNEL void reduceBornForce( ...@@ -73,11 +52,7 @@ KERNEL void reduceBornForce(
force += saTerm/bornRadius; force += saTerm/bornRadius;
energy += saTerm; energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index]; force *= bornRadius*bornRadius*obcChain[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
bornForce[index] = realToFixedPoint(force); bornForce[index] = realToFixedPoint(force);
#else
bornForce[index] = force;
#endif
} }
energyBuffer[GLOBAL_ID] += energy/-6; energyBuffer[GLOBAL_ID] += energy/-6;
} }
...@@ -9,11 +9,7 @@ typedef struct { ...@@ -9,11 +9,7 @@ typedef struct {
* Compute the Born sum. * Compute the Born sum.
*/ */
KERNEL void computeBornSum( KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT global_bornSum, GLOBAL mm_long* RESTRICT global_bornSum,
#else
GLOBAL real* RESTRICT global_bornSum,
#endif
GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
...@@ -87,12 +83,7 @@ KERNEL void computeBornSum( ...@@ -87,12 +83,7 @@ KERNEL void computeBornSum(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum)); ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
} }
} }
else { else {
...@@ -149,24 +140,14 @@ KERNEL void computeBornSum( ...@@ -149,24 +140,14 @@ KERNEL void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum)); ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
} }
// Write results. // Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&global_bornSum[offset], realToFixedPoint(localData[tgx].bornSum)); ATOMIC_ADD(&global_bornSum[offset], realToFixedPoint(localData[tgx].bornSum));
#else
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
} }
} }
} }
...@@ -296,12 +277,7 @@ KERNEL void computeBornSum( ...@@ -296,12 +277,7 @@ KERNEL void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum)); ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
} }
} }
else else
...@@ -359,12 +335,7 @@ KERNEL void computeBornSum( ...@@ -359,12 +335,7 @@ KERNEL void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum)); ATOMIC_ADD(&global_bornSum[atom1], realToFixedPoint(bornSum));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
} }
} }
...@@ -377,12 +348,7 @@ KERNEL void computeBornSum( ...@@ -377,12 +348,7 @@ KERNEL void computeBornSum(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&global_bornSum[atom2], realToFixedPoint(localData[tgx].bornSum)); ATOMIC_ADD(&global_bornSum[atom2], realToFixedPoint(localData[tgx].bornSum));
#else
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
} }
} }
} }
...@@ -402,11 +368,7 @@ typedef struct { ...@@ -402,11 +368,7 @@ typedef struct {
*/ */
KERNEL void computeGBSAForce1( KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce, GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
#else
GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif
GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
GLOBAL const real* RESTRICT global_bornRadii, int needEnergy, GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -490,16 +452,10 @@ KERNEL void computeGBSAForce1( ...@@ -490,16 +452,10 @@ KERNEL void computeGBSAForce1(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w)); ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
} }
} }
else { else {
...@@ -561,36 +517,20 @@ KERNEL void computeGBSAForce1( ...@@ -561,36 +517,20 @@ KERNEL void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w)); ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
} }
// Write results. // Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx)); ATOMIC_ADD(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
ATOMIC_ADD(&global_bornForce[offset], realToFixedPoint(localData[tgx].fw)); ATOMIC_ADD(&global_bornForce[offset], realToFixedPoint(localData[tgx].fw));
#else
unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
#endif
} }
} }
} }
...@@ -722,16 +662,10 @@ KERNEL void computeGBSAForce1( ...@@ -722,16 +662,10 @@ KERNEL void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w)); ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
} }
} }
else else
...@@ -790,16 +724,10 @@ KERNEL void computeGBSAForce1( ...@@ -790,16 +724,10 @@ KERNEL void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w)); ATOMIC_ADD(&global_bornForce[atom1], realToFixedPoint(force.w));
#else
unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w;
#endif
} }
} }
...@@ -812,20 +740,10 @@ KERNEL void computeGBSAForce1( ...@@ -812,20 +740,10 @@ KERNEL void computeGBSAForce1(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx)); ATOMIC_ADD(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
ATOMIC_ADD(&global_bornForce[atom2], realToFixedPoint(localData[tgx].fw)); ATOMIC_ADD(&global_bornForce[atom2], realToFixedPoint(localData[tgx].fw));
#else
unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
#endif
} }
} }
} }
......
KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex, KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RESTRICT pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ
#ifndef SUPPORTS_64_BIT_ATOMICS
, GLOBAL real4* RESTRICT pmeBsplineTheta, LOCAL real4* RESTRICT bsplinesCache,
#ifdef CHARGE_FROM_SIGEPS
GLOBAL const float2* RESTRICT sigmaEpsilon
#else
GLOBAL const real* RESTRICT charges
#endif
#endif
) { ) {
// Compute the index of the grid point each atom is associated with. // Compute the index of the grid point each atom is associated with.
...@@ -25,42 +17,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE ...@@ -25,42 +17,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z); ((int) t.z) % GRID_SIZE_Z);
pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z); pmeAtomGridIndex[atom] = make_int2(atom, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
#ifndef SUPPORTS_64_BIT_ATOMICS
// Compute B-splines here for use in the charge spreading kernel.
const real4 scale = 1/(real) (PME_ORDER-1);
LOCAL real4* data = &bsplinesCache[LOCAL_ID*PME_ORDER];
real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f;
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real4(k))*data[j-k-2] + (-dr+make_real4(j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real4(j))*data[PME_ORDER-j-2] + (-dr+make_real4(PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
for (int j = 0; j < PME_ORDER; j++) {
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
const real charge = CHARGE;
#endif
data[j].w = charge; // Storing the charge here improves cache coherency in the charge spreading kernel
pmeBsplineTheta[atom+j*NUM_ATOMS] = data[j];
}
#endif
} }
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#if defined(USE_HIP) && !defined(AMD_RDNA) #if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1) LAUNCH_BOUNDS_EXACT(128, 1)
#endif #endif
...@@ -206,197 +165,6 @@ KERNEL void finishSpreadCharge( ...@@ -206,197 +165,6 @@ KERNEL void finishSpreadCharge(
#endif #endif
} }
} }
#elif defined(DEVICE_IS_CPU)
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ,
#ifdef CHARGE_FROM_SIGEPS
GLOBAL const float2* RESTRICT sigmaEpsilon
#else
GLOBAL const real* RESTRICT charges
#endif
) {
const int firstx = GLOBAL_ID*GRID_SIZE_X/GLOBAL_SIZE;
const int lastx = (GLOBAL_ID+1)*GRID_SIZE_X/GLOBAL_SIZE;
if (firstx == lastx)
return;
const real4 scale = 1/(real) (PME_ORDER-1);
real4 data[PME_ORDER];
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for (int i = 0; i < NUM_ATOMS; i++) {
int atom = i;
real4 pos = posq[atom];
APPLY_PERIODIC_TO_POS(pos)
real3 t = (real3) (pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
pos.z*recipBoxVecZ.z);
t.x = (t.x-floor(t.x))*GRID_SIZE_X;
t.y = (t.y-floor(t.y))*GRID_SIZE_Y;
t.z = (t.z-floor(t.z))*GRID_SIZE_Z;
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
// Spread the charge from this atom onto each grid point.
#ifdef CHARGE_FROM_SIGEPS
const float2 sigEps = sigmaEpsilon[atom];
const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
#else
const real charge = (CHARGE)*EPSILON_FACTOR;
#endif
if (charge == 0)
continue;
bool hasComputedThetas = false;
for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
if (xindex < firstx || xindex >= lastx)
continue;
if (!hasComputedThetas) {
hasComputedThetas = true;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f;
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
}
for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index] += charge*data[ix].x*data[iy].y*data[iz].z;
}
}
}
}
}
#else
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
KERNEL void findAtomRangeForGrid(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL int* RESTRICT pmeAtomRange, GLOBAL const real4* RESTRICT posq) {
int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (GLOBAL_ID == GLOBAL_SIZE-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
KERNEL void recordZIndex(GLOBAL int2* RESTRICT pmeAtomGridIndex, GLOBAL const real4* RESTRICT posq, real4 periodicBoxSize, real4 recipBoxVecZ) {
int start = (NUM_ATOMS*GLOBAL_ID)/GLOBAL_SIZE;
int end = (NUM_ATOMS*(GLOBAL_ID+1))/GLOBAL_SIZE;
for (int i = start; i < end; ++i) {
real posz = posq[pmeAtomGridIndex[i].x].z;
posz -= floor(posz*recipBoxVecZ.z)*periodicBoxSize.z;
int z = ((int) ((posz*recipBoxVecZ.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
pmeAtomGridIndex[i].y = z;
}
}
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT pmeGrid,
GLOBAL const int2* RESTRICT pmeAtomGridIndex, GLOBAL const int* RESTRICT pmeAtomRange,
GLOBAL const real4* RESTRICT pmeBsplineTheta
#ifdef CHARGE_FROM_SIGEPS
, GLOBAL const float2* RESTRICT sigmaEpsilon
#else
, GLOBAL const real* RESTRICT charges
#endif
) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = GLOBAL_ID; gridIndex < numGridPoints; gridIndex += GLOBAL_SIZE) {
// Compute the charge on a grid point.
int4 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real result = 0.0f;
// Loop over all atoms that affect this grid point.
for (int ix = 0; ix < PME_ORDER; ++ix) {
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
for (int iy = 0; iy < PME_ORDER; ++iy) {
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1];
int lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
if (z1 > gridPoint.z)
{
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
firstAtom = pmeAtomRange[gridIndex1];
lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
}
}
}
pmeGrid[gridIndex] = result*EPSILON_FACTOR;
}
}
#endif
KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX, KERNEL void reciprocalConvolution(GLOBAL real2* RESTRICT pmeGrid, GLOBAL const real* RESTRICT pmeBsplineModuliX,
GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ, GLOBAL const real* RESTRICT pmeBsplineModuliY, GLOBAL const real* RESTRICT pmeBsplineModuliZ,
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. * * Portions copyright (c) 2011-2022 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -133,12 +133,6 @@ public: ...@@ -133,12 +133,6 @@ public:
* Initialize this object in preparation for a simulation. * Initialize this object in preparation for a simulation.
*/ */
void initialize(const System& system); void initialize(const System& system);
/**
* Get the number of force buffers required for bonded forces.
*/
int getNumForceBuffers() {
return numForceBuffers;
}
/** /**
* Compute the bonded interactions. * Compute the bonded interactions.
* *
...@@ -148,19 +142,17 @@ public: ...@@ -148,19 +142,17 @@ public:
private: private:
std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce); std::string createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const std::string& computeForce);
OpenCLContext& context; OpenCLContext& context;
std::vector<cl::Kernel> kernels; cl::Kernel kernel;
std::vector<std::vector<std::vector<int> > > forceAtoms; std::vector<std::vector<std::vector<int> > > forceAtoms;
std::vector<int> indexWidth; std::vector<int> indexWidth;
std::vector<std::string> forceSource; std::vector<std::string> forceSource;
std::vector<int> forceGroup; std::vector<int> forceGroup;
std::vector<std::vector<int> > forceSets;
std::vector<cl::Memory*> arguments; std::vector<cl::Memory*> arguments;
std::vector<std::string> argTypes; std::vector<std::string> argTypes;
std::vector<OpenCLArray> atomIndices; std::vector<OpenCLArray> atomIndices;
std::vector<OpenCLArray> bufferIndices;
std::vector<std::string> prefixCode; std::vector<std::string> prefixCode;
std::vector<std::string> energyParameterDerivatives; std::vector<std::string> energyParameterDerivatives;
int numForceBuffers, maxBonds, allGroups; int maxBonds, allGroups;
bool hasInitializedKernels; bool hasInitializedKernels;
}; };
......
...@@ -125,7 +125,7 @@ public: ...@@ -125,7 +125,7 @@ public:
* Get the number of force buffers required for nonbonded forces. * Get the number of force buffers required for nonbonded forces.
*/ */
int getNumForceBuffers() const { int getNumForceBuffers() const {
return numForceBuffers; return 1;
} }
/** /**
* Get the number of energy buffers required for nonbonded forces. * Get the number of energy buffers required for nonbonded forces.
...@@ -331,7 +331,7 @@ private: ...@@ -331,7 +331,7 @@ private:
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList; bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding, forceRebuildNeighborList;
int numForceBuffers, startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks; int startTileIndex, startBlockIndex, numBlocks, maxExclusions, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags; int forceThreadBlockSize, interactingBlocksThreadBlockSize, groupFlags;
unsigned int tilesAfterReorder; unsigned int tilesAfterReorder;
long long numTiles; long long numTiles;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. * * Portions copyright (c) 2011-2022 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) { OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
} }
void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) { void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
...@@ -85,11 +85,8 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -85,11 +85,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
if (numForces == 0) if (numForces == 0)
return; return;
// Build the lists of atom indices and buffer indices. // Build the lists of atom indices.
vector<vector<cl_uint> > bufferVec(numForces);
vector<vector<int> > bufferCounter(numForces, vector<int>(system.getNumParticles(), 0));
vector<int> numBuffers(numForces, 0);
atomIndices.resize(numForces); atomIndices.resize(numForces);
for (int i = 0; i < numForces; i++) { for (int i = 0; i < numForces; i++) {
int numBonds = forceAtoms[i].size(); int numBonds = forceAtoms[i].size();
...@@ -102,100 +99,17 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -102,100 +99,17 @@ void OpenCLBondedUtilities::initialize(const System& system) {
} }
atomIndices[i].initialize<cl_uint>(context, indexVec.size(), "bondedIndices"); atomIndices[i].initialize<cl_uint>(context, indexVec.size(), "bondedIndices");
atomIndices[i].upload(indexVec); atomIndices[i].upload(indexVec);
bufferVec[i].resize(width*numBonds, 0);
for (int bond = 0; bond < numBonds; bond++) {
for (int atom = 0; atom < numAtoms; atom++)
bufferVec[i][bond*width+atom] = bufferCounter[i][forceAtoms[i][bond][atom]]++;
}
for (int j = 0; j < (int) bufferCounter[i].size(); j++)
numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
}
// For efficiency, we want to merge multiple forces into a single kernel - but only if that
// won't increase the number of force buffers.
if (context.getSupports64BitGlobalAtomics()) {
// Put all the forces in the same set.
numForceBuffers = 1;
forceSets.push_back(vector<int>());
for (int i = 0; i < numForces; i++)
forceSets[0].push_back(i);
}
else {
// Figure out how many force buffers will be required.
for (int i = 0; i < numForces; i++)
numForceBuffers = max(numForceBuffers, numBuffers[i]);
int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
if (context.getNonbondedUtilities().getHasInteractions())
bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
// Figure out sets of forces that can be merged.
vector<int> unmerged(numForces);
for (int i = 0; i < numForces; i++)
unmerged[i] = i;
for (int i = 0; i < numForces; i++)
for (int j = i-1; j >= 0; j--) {
if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
break;
int temp = unmerged[j+1];
unmerged[j+1] = unmerged[j];
unmerged[j] = temp;
}
while (unmerged.size() > 0) {
int sum = numBuffers[unmerged.back()];
int i;
for (i = 0; i < (int) unmerged.size()-1; i++) {
if (sum+numBuffers[unmerged[i]] > bufferLimit)
break;
sum += numBuffers[unmerged[i]];
}
forceSets.push_back(vector<int>());
for (int j = 0; j < i; j++)
forceSets.back().push_back(unmerged[j]);
forceSets.back().push_back(unmerged.back());
for (int j = 0; j < i; j++)
unmerged.erase(unmerged.begin());
unmerged.pop_back();
}
}
// Update the buffer indices based on merged sets.
bufferIndices.resize(numForces);
for (int i = 0; i < (int) forceSets.size(); i++)
for (int j = 0; j < (int) forceSets[i].size(); j++) {
int force = forceSets[i][j];
int numBonds = forceAtoms[force].size();
int numAtoms = forceAtoms[force][0].size();
int width = indexWidth[force];
for (int k = 0; k < j; k++)
for (int bond = 0; bond < numBonds; bond++)
for (int atom = 0; atom < numAtoms; atom++)
bufferVec[force][bond*width+atom] += bufferCounter[forceSets[i][k]][forceAtoms[force][bond][atom]];
bufferIndices[force].initialize<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
bufferIndices[force].upload(bufferVec[force]);
} }
// Create the kernels. // Create the kernel.
for (auto& set : forceSets) {
int setSize = set.size();
stringstream s; stringstream s;
s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
s<<"#endif\n";
for (int i = 0; i < (int) prefixCode.size(); i++) for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i]; s<<prefixCode[i];
string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4"); s<<"__kernel void computeBondedForces(__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"; for (int force = 0; force < numForces; force++) {
for (int i = 0; i < setSize; i++) {
int force = set[i];
string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force])); string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
s<<", __global const "<<indexType<<"* restrict atomIndices"<<i; s<<", __global const "<<indexType<<"* restrict atomIndices"<<force;
s<<", __global const "<<indexType<<"* restrict bufferIndices"<<i;
} }
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1); s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
...@@ -205,10 +119,8 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -205,10 +119,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
s<<"mixed energy = 0;\n"; s<<"mixed energy = 0;\n";
for (int i = 0; i < energyParameterDerivatives.size(); i++) for (int i = 0; i < energyParameterDerivatives.size(); i++)
s<<"mixed energyParamDeriv"<<i<<" = 0;\n"; s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
for (int i = 0; i < setSize; i++) { for (int force = 0; force < numForces; force++)
int force = set[i]; s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
}
s<<"energyBuffer[get_global_id(0)] += energy;\n"; s<<"energyBuffer[get_global_id(0)] += energy;\n";
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames(); const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size(); int numDerivs = allParamDerivNames.size();
...@@ -220,8 +132,7 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -220,8 +132,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
map<string, string> defines; map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
cl::Program program = context.createProgram(s.str(), defines); cl::Program program = context.createProgram(s.str(), defines);
kernels.push_back(cl::Kernel(program, "computeBondedForces")); kernel = cl::Kernel(program, "computeBondedForces");
}
forceAtoms.clear(); forceAtoms.clear();
forceSource.clear(); forceSource.clear();
} }
...@@ -247,7 +158,6 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in ...@@ -247,7 +158,6 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<"if ((groups&"<<(1<<group)<<") != 0)\n"; s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n"; s<<"for (unsigned int index = get_global_id(0); index < "<<numBonds<<"; index += get_global_size(0)) {\n";
s<<" "<<indexType<<" atoms = atomIndices"<<forceIndex<<"[index];\n"; s<<" "<<indexType<<" atoms = atomIndices"<<forceIndex<<"[index];\n";
s<<" "<<indexType<<" buffers = bufferIndices"<<forceIndex<<"[index];\n";
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
s<<" unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n"; s<<" unsigned int atom"<<(i+1)<<" = atoms"<<suffix[i]<<";\n";
s<<" real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n"; s<<" real4 pos"<<(i+1)<<" = posq[atom"<<(i+1)<<"];\n";
...@@ -255,17 +165,9 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in ...@@ -255,17 +165,9 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<computeForce<<"\n"; s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
s<<" {\n"; s<<" {\n";
if (context.getSupports64BitGlobalAtomics()) { s<<" ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".x));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"], realToFixedPoint(force"<<(i+1)<<".x));\n"; s<<" ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".y));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".y));\n"; s<<" ATOMIC_ADD(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force"<<(i+1)<<".z));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], realToFixedPoint(force"<<(i+1)<<".z));\n";
}
else {
s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
s<<" real4 force = forceBuffers[offset];\n";
s<<" force.xyz += force"<<(i+1)<<".xyz;\n";
s<<" forceBuffers[offset] = force;\n";
}
s<<" }\n"; s<<" }\n";
} }
s<<"}\n"; s<<"}\n";
...@@ -277,28 +179,18 @@ void OpenCLBondedUtilities::computeInteractions(int groups) { ...@@ -277,28 +179,18 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
return; return;
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
for (int i = 0; i < (int) forceSets.size(); i++) {
int index = 0; int index = 0;
cl::Kernel& kernel = kernels[i];
if (context.getSupports64BitGlobalAtomics())
kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
else
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
index += 6; index += 6;
for (int j = 0; j < (int) forceSets[i].size(); j++) { for (int j = 0; j < (int) atomIndices.size(); j++)
kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]].getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, atomIndices[j].getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]].getDeviceBuffer());
}
for (int j = 0; j < (int) arguments.size(); j++) for (int j = 0; j < (int) arguments.size(); j++)
kernel.setArg<cl::Memory>(index++, *arguments[j]); kernel.setArg<cl::Memory>(index++, *arguments[j]);
if (energyParameterDerivatives.size() > 0) if (energyParameterDerivatives.size() > 0)
kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer()); kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
} }
}
for (int i = 0; i < (int) kernels.size(); i++) {
cl::Kernel& kernel = kernels[i];
kernel.setArg<cl_int>(3, groups); kernel.setArg<cl_int>(3, groups);
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision()) {
kernel.setArg<mm_double4>(4, context.getPeriodicBoxSizeDouble()); kernel.setArg<mm_double4>(4, context.getPeriodicBoxSizeDouble());
...@@ -314,6 +206,5 @@ void OpenCLBondedUtilities::computeInteractions(int groups) { ...@@ -314,6 +206,5 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
kernel.setArg<mm_float4>(7, context.getPeriodicBoxVecY()); kernel.setArg<mm_float4>(7, context.getPeriodicBoxVecY());
kernel.setArg<mm_float4>(8, context.getPeriodicBoxVecZ()); kernel.setArg<mm_float4>(8, context.getPeriodicBoxVecZ());
} }
context.executeKernel(kernels[i], maxBonds); context.executeKernel(kernel, maxBonds);
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment