Commit a0651b5b authored by Peter Eastman's avatar Peter Eastman
Browse files

Eliminated more local memory bank conflicts

parent 7b3a8266
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Portions copyright (c) 2008-2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -1216,7 +1216,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
for (int i = 0; i < tableSize; ++i)
erfcVector[i] = (float) erfc(i*(alpha*force.getCutoffDistance())/(tableSize-1));
erfcTable->upload(erfcVector);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo("erfcTable", "float", sizeof(cl_float), erfcTable->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo("erfcTable", "float", 1, sizeof(cl_float), erfcTable->getDeviceBuffer()));
}
// Add the interaction to the default nonbonded kernel.
......@@ -1224,7 +1224,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines);
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source);
if (hasLJ)
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float2", sizeof(cl_float2), sigmaEpsilon->getDeviceBuffer()));
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(cl_float2), sigmaEpsilon->getDeviceBuffer()));
// Initialize the exceptions.
......@@ -1431,12 +1431,12 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, interpolating);
tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float4", sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
}
if (force.getNumFunctions() > 0) {
tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float4", sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
}
// Record information for the expressions.
......@@ -1479,11 +1479,11 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source);
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+intToString(i+1), buffer.getType(), buffer.getSize(), buffer.getMemory()));
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"params"+intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
}
if (globals != NULL) {
globals->upload(globalParamValues);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", sizeof(cl_float), globals->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
}
cl.addForce(new OpenCLCustomNonbondedForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
}
......@@ -1559,8 +1559,8 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
string source = OpenCLKernelSources::gbsaObc2;
nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source);
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float2", sizeof(cl_float2), params->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", sizeof(cl_float), bornForce->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDeviceBuffer()));;
nb.addParameter(OpenCLNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDeviceBuffer()));;
cl.addForce(new OpenCLGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
}
......@@ -1589,42 +1589,39 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default);
cl::Program program = cl.createProgram(file, defines);
int index = 0;
computeBornSumKernel = cl::Kernel(program, "computeBornSum");
computeBornSumKernel.setArg<cl::Buffer>(0, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg(1, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
computeBornSumKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
computeBornSumKernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(index++, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*13*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionCount().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
}
else {
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(8, nb.getTiles().getSize());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
force1Kernel = cl::Kernel(program, "computeGBSAForce1");
force1Kernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg(4, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(7, bornForce->getDeviceBuffer());
force1Kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
index = 0;
force1Kernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*13*sizeof(cl_float), NULL);
force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(12, nb.getInteractionCount().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
}
else {
force1Kernel.setArg<cl::Buffer>(10, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(11, nb.getTiles().getSize());
force1Kernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines);
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
......@@ -1773,13 +1770,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
vector<mm_float4> f = OpenCLExpressionUtilities::computeFunctionCoefficients(values, interpolating);
tabulatedFunctions.push_back(new OpenCLArray<mm_float4>(cl, values.size()-1, "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float4", sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(arrayName, "float", 4, sizeof(cl_float4), tabulatedFunctions[tabulatedFunctions.size()-1]->getDeviceBuffer()));
tableArgs << ", __global float4* " << arrayName;
}
if (force.getNumFunctions() > 0) {
tabulatedFunctionParams = new OpenCLArray<mm_float4>(cl, tabulatedFunctionParamsVec.size(), "tabulatedFunctionParameters", false, CL_MEM_READ_ONLY);
tabulatedFunctionParams->upload(tabulatedFunctionParamsVec);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float4", sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"functionParams", "float", 4, sizeof(cl_float4), tabulatedFunctionParams->getDeviceBuffer()));
tableArgs << ", __constant float4* " << prefix << "functionParams";
}
......@@ -2162,21 +2159,21 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = prefix+"params"+intToString(i+1);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getType(), buffer.getSize(), buffer.getMemory()));
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
}
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string paramName = prefix+"values"+intToString(i+1);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getType(), buffer.getSize(), buffer.getMemory()));
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
}
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
string paramName = prefix+"dEdV"+intToString(i+1);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getType(), buffer.getSize(), buffer.getMemory()));
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(paramName, buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
}
if (globals != NULL) {
globals->upload(globalParamValues);
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", sizeof(cl_float), globals->getDeviceBuffer()));
cl.getNonbondedUtilities().addArgument(OpenCLNonbondedUtilities::ParameterInfo(prefix+"globals", "float", 1, sizeof(cl_float), globals->getDeviceBuffer()));
}
}
cl.addForce(new OpenCLCustomGBForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force));
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -283,9 +283,15 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
int localDataSize = 7*sizeof(cl_float);
const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData;
for (int i = 0; i < (int) params.size(); i++) {
localData << params[i].getType() << " " << params[i].getName() << ";\n";
if (params[i].getNumComponents() == 1)
localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
else {
for (int j = 0; j < params[i].getNumComponents(); ++j)
localData<<params[i].getComponentType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<";\n";
}
localDataSize += params[i].getSize();
}
if ((localDataSize/4)%2 == 0) {
......@@ -318,20 +324,25 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) {
loadLocal1 << "localData[get_local_id(0)].";
loadLocal1 << params[i].getName();
loadLocal1 << " = ";
loadLocal1 << params[i].getName();
loadLocal1 << "1;\n";
if (params[i].getNumComponents() == 1) {
loadLocal1<<"localData[get_local_id(0)]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
}
else {
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal1<<"localData[get_local_id(0)]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) {
loadLocal2 << "localData[get_local_id(0)].";
loadLocal2 << params[i].getName();
loadLocal2 << " = global_";
loadLocal2 << params[i].getName();
loadLocal2 << "[j];\n";
if (params[i].getNumComponents() == 1) {
loadLocal2<<"localData[get_local_id(0)]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
else {
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2<<"localData[get_local_id(0)]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load1;
......@@ -346,12 +357,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
stringstream load2j;
for (int i = 0; i < (int) params.size(); i++) {
load2j << params[i].getType();
load2j << " ";
load2j << params[i].getName();
load2j << "2 = localData[atom2].";
load2j << params[i].getName();
load2j << ";\n";
if (params[i].getNumComponents() == 1) {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
}
else {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = ("<<params[i].getType()<<") (";
for (int j = 0; j < params[i].getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<params[i].getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
map<string, string> defines;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -29,6 +29,7 @@
#include "OpenCLContext.h"
#include "openmm/System.h"
#include "OpenCLExpressionUtilities.h"
#include <string>
#include <vector>
......@@ -228,20 +229,31 @@ public:
/**
* Create a ParameterInfo object.
*
* @param name the name of the parameter
* @param type the data type of the parameter
* @param size the size of the parameter in bytes
* @param memory the memory containing the parameter values
* @param name the name of the parameter
* @param type the data type of the parameter's components
* @param numComponents the number of components in the parameter
* @param size the size of the parameter in bytes
* @param memory the memory containing the parameter values
*/
ParameterInfo(const std::string& name, const std::string& type, int size, cl::Memory& memory) :
name(name), type(type), size(size), memory(&memory) {
ParameterInfo(const std::string& name, const std::string& componentType, int numComponents, int size, cl::Memory& memory) :
name(name), componentType(componentType), numComponents(numComponents), size(size), memory(&memory) {
if (numComponents == 1)
type = componentType;
else
type = componentType+OpenCLExpressionUtilities::intToString(numComponents);
}
const std::string& getName() const {
return name;
}
const std::string& getComponentType() const {
return componentType;
}
const std::string& getType() const {
return type;
}
int getNumComponents() const {
return numComponents;
}
int getSize() const {
return size;
}
......@@ -250,8 +262,9 @@ public:
}
private:
std::string name;
std::string componentType;
std::string type;
int size;
int size, numComponents;
cl::Memory* memory;
};
......
......@@ -41,21 +41,21 @@ OpenCLParameterSet::OpenCLParameterSet(OpenCLContext& context, int numParameters
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float4));
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float4", sizeof(mm_float4), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 4, sizeof(mm_float4), *buf));
params -= 4;
}
if (params > 1) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(mm_float2));
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float2", sizeof(mm_float2), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 2, sizeof(mm_float2), *buf));
params -= 2;
}
if (params > 0) {
cl::Buffer* buf = new cl::Buffer(context.getContext(), CL_MEM_READ_WRITE, numObjects*sizeof(cl_float));
std::stringstream name;
name << "param" << (++bufferCount);
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", sizeof(cl_float), *buf));
buffers.push_back(OpenCLNonbondedUtilities::ParameterInfo(name.str(), "float", 1, sizeof(cl_float), *buf));
}
}
catch (cl::Error err) {
......
#define TILE_SIZE 32
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float radius, scaledRadius;
float bornSum;
float bornRadius;
float bornForce;
} AtomData;
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
......@@ -34,13 +43,17 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
local_params[get_local_id(0)] = params1;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (local_posq[baseLocalAtom+j].xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -54,7 +67,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float2 params2 = local_params[baseLocalAtom+j];
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
......@@ -89,10 +102,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
local_params[get_local_id(0)] = global_params[j];
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
local_bornSum[get_local_id(0)] = 0.0f;
localData[get_local_id(0)].bornSum = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
......@@ -102,7 +121,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (local_posq[baseLocalAtom+tj].xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -116,7 +135,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float2 params2 = local_params[baseLocalAtom+tj];
float2 params2 = (float2) (localData[baseLocalAtom+tj].radius, localData[baseLocalAtom+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
......@@ -140,7 +159,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij);
local_bornSum[baseLocalAtom+tj+forceBufferOffset] += term;
localData[baseLocalAtom+tj+forceBufferOffset].bornSum += term;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
......@@ -161,7 +180,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += local_bornSum[get_local_id(0)]+local_bornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
}
lasty = y;
}
......@@ -174,8 +193,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
*/
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles,
__global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
......@@ -204,14 +223,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
local_bornRadii[get_local_id(0)] = bornRadius1;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
float4 posq2 = local_posq[baseLocalAtom+j];
float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -221,7 +243,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float bornRadius2 = local_bornRadii[baseLocalAtom+j];
float bornRadius2 = localData[baseLocalAtom+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
......@@ -264,10 +286,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
local_bornRadii[get_local_id(0)] = global_bornRadii[j];
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
}
local_force[get_local_id(0)] = 0.0f;
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
......@@ -278,7 +307,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
float4 posq2 = local_posq[baseLocalAtom+tj];
float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -288,7 +317,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float bornRadius2 = local_bornRadii[baseLocalAtom+tj];
float bornRadius2 = localData[baseLocalAtom+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
......@@ -308,7 +337,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
localData[baseLocalAtom+tj+forceBufferOffset].fx += delta.x;
localData[baseLocalAtom+tj+forceBufferOffset].fy += delta.y;
localData[baseLocalAtom+tj+forceBufferOffset].fz += delta.z;
localData[baseLocalAtom+tj+forceBufferOffset].fw += dGpol_dalpha2_ij*bornRadius1;
}
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2);
......@@ -328,9 +360,13 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx,
localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy,
localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz,
localData[get_local_id(0)].fw+localData[get_local_id(0)+TILE_SIZE].fw);
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+sum.xyz;
global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
global_bornForce[offset2] += local_force[get_local_id(0)].w+local_force[get_local_id(0)+TILE_SIZE].w;
global_bornForce[offset2] += sum.w;
}
lasty = y;
}
......
#define TILE_SIZE 32
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float radius, scaledRadius;
float bornSum;
float bornRadius;
float bornForce;
} AtomData;
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
......@@ -35,12 +44,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
local_params[get_local_id(0)] = params1;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -54,7 +67,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float2 params2 = local_params[tbx+j];
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
......@@ -83,10 +96,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (lasty != y) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
local_params[get_local_id(0)] = global_params[j];
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
local_bornSum[get_local_id(0)] = 0.0f;
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) {
......@@ -98,7 +117,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -113,7 +132,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float2 params2 = local_params[tbx+j];
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
......@@ -152,7 +171,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
local_bornSum[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
}
}
......@@ -167,7 +186,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y*INV_PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -181,7 +200,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
#endif
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float2 params2 = local_params[tbx+tj];
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
......@@ -205,7 +224,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij);
local_bornSum[tbx+tj] += term;
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
......@@ -222,7 +241,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum;
global_bornSum[offset2] += local_bornSum[get_local_id(0)];
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
lasty = y;
}
pos++;
......@@ -234,8 +253,8 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
*/
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles,
__global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
......@@ -265,13 +284,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
local_bornRadii[get_local_id(0)] = bornRadius1;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
float4 posq2 = local_posq[tbx+j];
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -281,7 +303,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float bornRadius2 = local_bornRadii[tbx+j];
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
......@@ -318,10 +340,17 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
if (lasty != y) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
local_bornRadii[get_local_id(0)] = global_bornRadii[j];
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
}
local_force[get_local_id(0)] = 0.0f;
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) {
......@@ -333,7 +362,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[tbx+j];
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -343,7 +372,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float bornRadius2 = local_bornRadii[tbx+j];
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
......@@ -377,8 +406,13 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
local_force[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
}
}
}
......@@ -394,7 +428,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
float4 posq2 = local_posq[tbx+tj];
float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*INV_PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -404,7 +438,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = native_rsqrt(r2);
float r = native_recip(invR);
float bornRadius2 = local_bornRadii[tbx+tj];
float bornRadius2 = localData[tbx+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
......@@ -424,7 +458,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[tbx+tj] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
......@@ -439,9 +476,9 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
global_bornForce[offset1] += force.w;
global_bornForce[offset2] += local_force[get_local_id(0)].w;
global_bornForce[offset2] += localData[get_local_id(0)].fw;
lasty = y;
}
pos++;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment