Commit a0651b5b authored by Peter Eastman's avatar Peter Eastman
Browse files

Eliminated more local memory bank conflicts

parent 7b3a8266
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. * * Portions copyright (c) 2008-2010 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -1216,7 +1216,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1216,7 +1216,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
for (int i = 0; i < tableSize; ++i) for (int i = 0; i < tableSize; ++i)
erfcVector[i] = (float) erfc(i*(alpha*force.getCutoffDistance())/(tableSize-1)); erfcVector[i] = (float) erfc(i*(alpha*force.getCutoffDistance())/(tableSize-1));
erfcTable->upload(erfcVector); erfcTable->upload(erfcVector);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo("erfcTable", "float", sizeof(cl_float), erfcTable->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo("erfcTable", "float", 1, sizeof(cl_float), erfcTable->getDeviceBuffer()));
} }
// Add the interaction to the default nonbonded kernel. // Add the interaction to the default nonbonded kernel.
...@@ -1224,7 +1224,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1224,7 +1224,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines); string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines);
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source); cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source);
if (hasLJ) if (hasLJ)
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float2", sizeof(cl_float2), sigmaEpsilon->getDeviceBuffer())); cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(cl_float2), sigmaEpsilon->getDeviceBuffer()));
// Initialize the exceptions. // Initialize the exceptions.
...@@ -1431,12 +1431,12 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons ...@@ -1431,12 +1431,12 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, interpolating); vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, interpolating);
tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction")); tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f); tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float4", sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
} }
if (force.getNumFunctions() > 0) { if (force.getNumFunctions() > 0) {
tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY); tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
tabulatedFunctionParams->upload(tabulatedFunctionParamsVec); tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float4", sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
} }
// Record information for the expressions. // Record information for the expressions.
...@@ -1479,11 +1479,11 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons ...@@ -1479,11 +1479,11 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source); cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source);
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+intToString(i+1), buffer.getType(), buffer.getSize(), buffer.getMemory())); cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
} }
if (globals != NULL) { if (globals != NULL) {
globals->upload(globalParamValues); globals->upload(globalParamValues);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", sizeof(cl_float), globals->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
} }
cl.addForce(new OpenCLCustomNonbondedForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force)); cl.addForce(new OpenCLCustomNonbondedForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
} }
...@@ -1559,8 +1559,8 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB ...@@ -1559,8 +1559,8 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic); bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
string source = OpenCLKernelSources::gbsaObc2; string source = OpenCLKernelSources::gbsaObc2;
nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source); nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source);
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float2", sizeof(cl_float2), params->getDeviceBuffer()));; nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", sizeof(cl_float), bornForce->getDeviceBuffer()));; nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDeviceBuffer()));;
cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force)); cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
} }
...@@ -1589,42 +1589,39 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) { ...@@ -1589,42 +1589,39 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default); string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default);
cl::Program program = cl.createProgram(file, defines); cl::Program program = cl.createProgram(file, defines);
int index = 0;
computeBornSumKernel = cl::Kernel(program, "computeBornSum"); computeBornSumKernel = cl::Kernel(program, "computeBornSum");
computeBornSumKernel.setArg<cl::Buffer>(0, bornSum->getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg(1, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*13*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer()); computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
computeBornSumKernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
} }
else { else {
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(8, nb.getTiles().getSize()); computeBornSumKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
} }
force1Kernel = cl::Kernel(program, "computeGBSAForce1"); force1Kernel = cl::Kernel(program, "computeGBSAForce1");
force1Kernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer()); index = 0;
force1Kernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg(4, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*13*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(7, bornForce->getDeviceBuffer()); force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
force1Kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(12, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
} }
else { else {
force1Kernel.setArg<cl::Buffer>(10, nb.getTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(11, nb.getTiles().getSize()); force1Kernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
} }
program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines); program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines);
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum"); reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
...@@ -1773,13 +1770,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -1773,13 +1770,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, interpolating); vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, interpolating);
tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction")); tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f); tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float4", sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
tableArgs << ", __global float4* " << arrayName; tableArgs << ", __global float4* " << arrayName;
} }
if (force.getNumFunctions() > 0) { if (force.getNumFunctions() > 0) {
tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY); tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
tabulatedFunctionParams->upload(tabulatedFunctionParamsVec); tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float4", sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
tableArgs << ", __constant float4* " << prefix << "functionParams"; tableArgs << ", __constant float4* " << prefix << "functionParams";
} }
...@@ -2162,21 +2159,21 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2162,21 +2159,21 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = prefix+"params"+intToString(i+1); string paramName = prefix+"params"+intToString(i+1);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getType(), buffer.getSize(), buffer.getMemory())); cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
} }
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) { for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i]; const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string paramName = prefix+"values"+intToString(i+1); string paramName = prefix+"values"+intToString(i+1);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getType(), buffer.getSize(), buffer.getMemory())); cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
} }
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) { for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i]; const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
string paramName = prefix+"dEdV"+intToString(i+1); string paramName = prefix+"dEdV"+intToString(i+1);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getType(), buffer.getSize(), buffer.getMemory())); cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
} }
if (globals != NULL) { if (globals != NULL) {
globals->upload(globalParamValues); globals->upload(globalParamValues);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", sizeof(cl_float), globals->getDeviceBuffer())); cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
} }
} }
cl.addForce(new OpenCLCustomGBForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force)); cl.addForce(new OpenCLCustomGBForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -283,9 +283,15 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -283,9 +283,15 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 7*sizeof(cl_float); int localDataSize = 7*sizeof(cl_float);
const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData; stringstream localData;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
localData << params[i].getType() << " " << params[i].getName() << ";\n"; if (params[i].getNumComponents() == 1)
localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
else {
for (int j = 0; j < params[i].getNumComponents(); ++j)
localData<<params[i].getComponentType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<";\n";
}
localDataSize += params[i].getSize(); localDataSize += params[i].getSize();
} }
if ((localDataSize/4)%2 == 0) { if ((localDataSize/4)%2 == 0) {
...@@ -318,20 +324,25 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -318,20 +324,25 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream loadLocal1; stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
loadLocal1 << "localData[get_local_id(0)]."; if (params[i].getNumComponents() == 1) {
loadLocal1 << params[i].getName(); loadLocal1<<"localData[get_local_id(0)]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
loadLocal1 << " = "; }
loadLocal1 << params[i].getName(); else {
loadLocal1 << "1;\n"; for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal1<<"localData[get_local_id(0)]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
}
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream loadLocal2; stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
loadLocal2 << "localData[get_local_id(0)]."; if (params[i].getNumComponents() == 1) {
loadLocal2 << params[i].getName(); loadLocal2<<"localData[get_local_id(0)]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
loadLocal2 << " = global_"; }
loadLocal2 << params[i].getName(); else {
loadLocal2 << "[j];\n"; loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2<<"localData[get_local_id(0)]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
}
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load1; stringstream load1;
...@@ -346,12 +357,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -346,12 +357,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
stringstream load2j; stringstream load2j;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
load2j << params[i].getType(); if (params[i].getNumComponents() == 1) {
load2j << " "; load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
load2j << params[i].getName(); }
load2j << "2 = localData[atom2]."; else {
load2j << params[i].getName(); load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = ("<<params[i].getType()<<") (";
load2j << ";\n"; for (int j = 0; j < params[i].getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<params[i].getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}
} }
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
map<string, string> defines; map<string, string> defines;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "OpenCLContext.h" #include "OpenCLContext.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "OpenCLExpressionUtilities.h"
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -229,19 +230,30 @@ public: ...@@ -229,19 +230,30 @@ public:
* Create a ParameterInfo object. * Create a ParameterInfo object.
* *
* @param name the name of the parameter * @param name the name of the parameter
* @param type the data type of the parameter * @param type the data type of the parameter's components
* @param numComponents the number of components in the parameter
* @param size the size of the parameter in bytes * @param size the size of the parameter in bytes
* @param memory the memory containing the parameter values * @param memory the memory containing the parameter values
*/ */
ParameterInfo(const std::string& name, const std::string& type, int size, cl::Memory& memory) : ParameterInfo(const std::string& name, const std::string& componentType, int numComponents, int size, cl::Memory& memory) :
name(name), type(type), size(size), memory(&memory) { name(name), componentType(componentType), numComponents(numComponents), size(size), memory(&memory) {
if (numComponents == 1)
type = componentType;
else
type = componentType+OpenCLExpressionUtilities::intToString(numComponents);
} }
const std::string& getName() const { const std::string& getName() const {
return name; return name;
} }
const std::string& getComponentType() const {
return componentType;
}
const std::string& getType() const { const std::string& getType() const {
return type; return type;
} }
int getNumComponents() const {
return numComponents;
}
int getSize() const { int getSize() const {
return size; return size;
} }
...@@ -250,8 +262,9 @@ public: ...@@ -250,8 +262,9 @@ public:
} }
private: private:
std::string name; std::string name;
std::string componentType;
std::string type; std::string type;
int size; int size, numComponents;
cl::Memory* memory; cl::Memory* memory;
}; };
......
...@@ -41,21 +41,21 @@ OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters ...@@ -41,21 +41,21 @@ OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4));
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float4", sizeof(mm_float4), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf));
params -= 4; params -= 4;
} }
if (params > 1) { if (params > 1) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2));
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float2", sizeof(mm_float2), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf));
params -= 2; params -= 2;
} }
if (params > 0) { if (params > 0) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float)); cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float));
std::stringstream name; std::stringstream name;
name << "param" << (++bufferCount); name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", sizeof(cl_float), *buf)); buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf));
} }
} }
catch (cl::Error err) { catch (cl::Error err) {
......
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float radius, scaledRadius;
float bornSum;
float bornRadius;
float bornForce;
} AtomData;
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq, __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) { __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else #else
...@@ -34,13 +43,17 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -34,13 +43,17 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; localData[get_local_id(0)].x = posq1.x;
local_params[get_local_id(0)] = params1; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (local_posq[baseLocalAtom+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -54,7 +67,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -54,7 +67,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif #endif
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float2 params2 = local_params[baseLocalAtom+j]; float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
...@@ -89,10 +102,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -89,10 +102,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j]; float4 tempPosq = posq[j];
local_params[get_local_id(0)] = global_params[j]; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
} }
local_bornSum[get_local_id(0)] = 0.0f; localData[get_local_id(0)].bornSum = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
...@@ -102,7 +121,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -102,7 +121,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2; unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2); unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (local_posq[baseLocalAtom+tj].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -116,7 +135,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -116,7 +135,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif #endif
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float2 params2 = local_params[baseLocalAtom+tj]; float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
...@@ -140,7 +159,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -140,7 +159,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij); term += 2.0f*(1.0f/params2.x-l_ij);
local_bornSum[baseLocalAtom+tj+forceBufferOffset] += term; localData[baseLocalAtom+tj+forceBufferOffset].bornSum += term;
} }
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -161,7 +180,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -161,7 +180,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE]; global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += local_bornSum[get_local_id(0)]+local_bornSum[get_local_id(0)+TILE_SIZE]; global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
} }
lasty = y; lasty = y;
} }
...@@ -174,8 +193,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -174,8 +193,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
*/ */
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii, __global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles, __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) { __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else #else
...@@ -204,14 +223,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -204,14 +223,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; localData[get_local_id(0)].x = posq1.x;
local_bornRadii[get_local_id(0)] = bornRadius1; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
float4 posq2 = local_posq[baseLocalAtom+j]; float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -221,7 +243,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -221,7 +243,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float bornRadius2 = local_bornRadii[baseLocalAtom+j]; float bornRadius2 = localData[baseLocalAtom+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
...@@ -264,10 +286,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -264,10 +286,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j]; float4 tempPosq = posq[j];
local_bornRadii[get_local_id(0)] = global_bornRadii[j]; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
} }
local_force[get_local_id(0)] = 0.0f; localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
...@@ -278,7 +307,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -278,7 +307,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int tj = tgx%(TILE_SIZE/2); unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
float4 posq2 = local_posq[baseLocalAtom+tj]; float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -288,7 +317,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -288,7 +317,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float bornRadius2 = local_bornRadii[baseLocalAtom+tj]; float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
...@@ -308,7 +337,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -308,7 +337,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1); localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
localData[baseLocalAtom+tj+forceBufferOffset].fw += dGpol_dalpha2_ij*bornRadius1;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2); tj = (tj+1)%(TILE_SIZE/2);
...@@ -328,9 +360,13 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -328,9 +360,13 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz; float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx,
localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy,
localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz,
localData[get_local_id(0)].fw+localData[get_local_id(0)+TILE_SIZE].fw);
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+sum.xyz;
global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w; global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
global_bornForce[offset2] += local_force[get_local_id(0)].w+local_force[get_local_id(0)+TILE_SIZE].w; global_bornForce[offset2] += sum.w;
} }
lasty = y; lasty = y;
} }
......
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float radius, scaledRadius;
float bornSum;
float bornRadius;
float bornForce;
} AtomData;
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq, __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) { __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else #else
...@@ -35,12 +44,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -35,12 +44,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; localData[get_local_id(0)].x = posq1.x;
local_params[get_local_id(0)] = params1; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -54,7 +67,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -54,7 +67,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif #endif
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float2 params2 = local_params[tbx+j]; float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
...@@ -83,10 +96,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -83,10 +96,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j]; float4 tempPosq = posq[j];
local_params[get_local_id(0)] = global_params[j]; localData[get_local_id(0)].x = tempPosq.x;
} localData[get_local_id(0)].y = tempPosq.y;
local_bornSum[get_local_id(0)] = 0.0f; localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) { if (flags != 0xFFFFFFFF) {
...@@ -98,7 +117,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -98,7 +117,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -113,7 +132,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -113,7 +132,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif #endif
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float2 params2 = local_params[tbx+j]; float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
...@@ -152,7 +171,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -152,7 +171,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (tgx % 16 == 0) if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) if (tgx == 0)
local_bornSum[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16]; localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
} }
} }
} }
...@@ -167,7 +186,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -167,7 +186,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2; unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -181,7 +200,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -181,7 +200,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif #endif
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float2 params2 = local_params[tbx+tj]; float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y)); float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
...@@ -205,7 +224,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -205,7 +224,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij); term += 2.0f*(1.0f/params2.x-l_ij);
local_bornSum[tbx+tj] += term; localData[tbx+tj].bornSum += term;
} }
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
...@@ -222,7 +241,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -222,7 +241,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
global_bornSum[offset2] += local_bornSum[get_local_id(0)]; global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
lasty = y; lasty = y;
} }
pos++; pos++;
...@@ -234,8 +253,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca ...@@ -234,8 +253,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
*/ */
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii, __global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles, __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) { __global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else #else
...@@ -265,13 +284,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -265,13 +284,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; localData[get_local_id(0)].x = posq1.x;
local_bornRadii[get_local_id(0)] = bornRadius1; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
unsigned int xi = x/TILE_SIZE; unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
float4 posq2 = local_posq[tbx+j]; float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -281,7 +303,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -281,7 +303,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float bornRadius2 = local_bornRadii[tbx+j]; float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
...@@ -318,10 +340,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -318,10 +340,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j]; float4 tempPosq = posq[j];
local_bornRadii[get_local_id(0)] = global_bornRadii[j]; localData[get_local_id(0)].x = tempPosq.x;
} localData[get_local_id(0)].y = tempPosq.y;
local_force[get_local_id(0)] = 0.0f; localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
}
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) { if (flags != 0xFFFFFFFF) {
...@@ -333,7 +362,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -333,7 +362,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[tbx+j]; float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -343,7 +372,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -343,7 +372,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float bornRadius2 = local_bornRadii[tbx+j]; float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
...@@ -377,8 +406,13 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -377,8 +406,13 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0) if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) if (tgx == 0) {
local_force[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16]; float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
} }
} }
} }
...@@ -394,7 +428,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -394,7 +428,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
float4 posq2 = local_posq[tbx+tj]; float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
...@@ -404,7 +438,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -404,7 +438,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2); float invR = native_rsqrt(r2);
float r = native_recip(invR); float r = native_recip(invR);
float bornRadius2 = local_bornRadii[tbx+tj]; float bornRadius2 = localData[tbx+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij); float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
...@@ -424,7 +458,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -424,7 +458,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
local_force[tbx+tj] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1); localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
...@@ -439,9 +476,9 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -439,9 +476,9 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
global_bornForce[offset1] += force.w; global_bornForce[offset1] += force.w;
global_bornForce[offset2] += local_force[get_local_id(0)].w; global_bornForce[offset2] += localData[get_local_id(0)].fw;
lasty = y; lasty = y;
} }
pos++; pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment