Commit d65992ea authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimized nonbonded kernels by inlining lots of constant quantities

parent d3ec1d0c
......@@ -542,11 +542,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
if (useCutoff) {
double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
char k[50], c[50];
sprintf(k, "%.8ef", reactionFieldK);
sprintf(c, "%.8ef", reactionFieldC);
defines["REACTION_FIELD_K"] = string(k);
defines["REACTION_FIELD_C"] = string(c);
stringstream k, c;
k.precision(8);
c.precision(8);
k << scientific << reactionFieldK << "f";
c << scientific << reactionFieldC << "f";
defines["REACTION_FIELD_K"] = k.str();
defines["REACTION_FIELD_C"] = c.str();
}
// if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
// method = CUTOFF;
......@@ -838,73 +840,83 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines["USE_CUTOFF"] = "1";
if (nb.getUsePeriodic())
defines["USE_PERIODIC"] = "1";
stringstream xsize, ysize, zsize, cutoffSquared, prefac;
xsize.precision(8);
ysize.precision(8);
zsize.precision(8);
cutoffSquared.precision(8);
prefac.precision(8);
xsize << scientific << nb.getPeriodicBoxSize().x << "f";
ysize << scientific << nb.getPeriodicBoxSize().y << "f";
zsize << scientific << nb.getPeriodicBoxSize().z << "f";
cutoffSquared << scientific << (nb.getCutoffDistance()*nb.getCutoffDistance()) << "f";
prefac << scientific << prefactor << "f";
defines["PERIODIC_BOX_SIZE_X"] = xsize.str();
defines["PERIODIC_BOX_SIZE_Y"] = ysize.str();
defines["PERIODIC_BOX_SIZE_Z"] = zsize.str();
defines["CUTOFF_SQUARED"] = cutoffSquared.str();
defines["PREFACTOR"] = prefac.str();
stringstream natom, padded;
natom << cl.getNumAtoms();
padded << cl.getPaddedNumAtoms();
defines["NUM_ATOMS"] = natom.str();
defines["PADDED_NUM_ATOMS"] = padded.str();
cl::Program program = cl.createProgram(cl.loadSourceFromFile("gbsaObc.cl"), defines);
computeBornSumKernel = cl::Kernel(program, "computeBornSum");
computeBornSumKernel.setArg<cl_int>(0, cl.getNumAtoms());
computeBornSumKernel.setArg<cl_int>(1, cl.getPaddedNumAtoms());
computeBornSumKernel.setArg<cl::Buffer>(2, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
computeBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
computeBornSumKernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
computeBornSumKernel.setArg<cl::Buffer>(0, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg(1, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
computeBornSumKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_float>(9, nb.getCutoffDistance()*nb.getCutoffDistance());
computeBornSumKernel.setArg<mm_float4>(10, nb.getPeriodicBoxSize());
computeBornSumKernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(12, nb.getInteractionCount().getDeviceBuffer());
computeBornSumKernel.setArg(13, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionCount().getDeviceBuffer());
computeBornSumKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
}
else {
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(9, nb.getTiles().getSize());
computeBornSumKernel.setArg<cl::Buffer>(6, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(7, nb.getTiles().getSize());
}
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getNumAtoms());
reduceBornSumKernel.setArg<cl_int>(1, cl.getPaddedNumAtoms());
reduceBornSumKernel.setArg<cl_int>(2, cl.getNumForceBuffers());
reduceBornSumKernel.setArg<cl_float>(3, 1.0f);
reduceBornSumKernel.setArg<cl_float>(4, 0.8f);
reduceBornSumKernel.setArg<cl_float>(5, 4.85f);
reduceBornSumKernel.setArg<cl::Buffer>(6, bornSum->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(7, params->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(8, bornRadii->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(9, obcChain->getDeviceBuffer());
reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornSumKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
reduceBornSumKernel.setArg<cl::Buffer>(5, bornSum->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(7, bornRadii->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(8, obcChain->getDeviceBuffer());
force1Kernel = cl::Kernel(program, "computeGBSAForce1");
force1Kernel.setArg<cl_int>(0, cl.getNumAtoms());
force1Kernel.setArg<cl_int>(1, cl.getPaddedNumAtoms());
force1Kernel.setArg<cl_float>(2, prefactor);
force1Kernel.setArg<cl::Buffer>(3, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(4, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(5, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg<cl::Buffer>(8, bornRadii->getDeviceBuffer());
force1Kernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(10, bornForce->getDeviceBuffer());
force1Kernel.setArg(11, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg(4, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(7, bornForce->getDeviceBuffer());
force1Kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(12, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_float>(13, nb.getCutoffDistance()*nb.getCutoffDistance());
force1Kernel.setArg<mm_float4>(14, nb.getPeriodicBoxSize());
force1Kernel.setArg<cl::Buffer>(15, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(16, nb.getInteractionCount().getDeviceBuffer());
force1Kernel.setArg(17, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
force1Kernel.setArg<cl::Buffer>(9, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionCount().getDeviceBuffer());
force1Kernel.setArg(12, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
}
else {
force1Kernel.setArg<cl::Buffer>(12, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(13, nb.getTiles().getSize());
force1Kernel.setArg<cl::Buffer>(9, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(10, nb.getTiles().getSize());
}
reduceBornForceKernel = cl::Kernel(program, "reduceBornForce");
reduceBornForceKernel.setArg<cl_int>(0, cl.getNumAtoms());
reduceBornForceKernel.setArg<cl_int>(1, cl.getPaddedNumAtoms());
reduceBornForceKernel.setArg<cl_int>(2, cl.getNumForceBuffers());
reduceBornForceKernel.setArg<cl::Buffer>(3, bornForce->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(4, cl.getEnergyBuffer().getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(5, params->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(6, bornRadii->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(7, obcChain->getDeviceBuffer());
reduceBornForceKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornForceKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
reduceBornForceKernel.setArg<cl::Buffer>(2, bornForce->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(3, cl.getEnergyBuffer().getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
}
cl.clearBuffer(*bornSum);
cl.clearBuffer(*bornForce);
......
......@@ -327,33 +327,47 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_PERIODIC"] = "1";
if (useExclusions)
defines["USE_EXCLUSIONS"] = "1";
stringstream xsize, ysize, zsize, cutoffSquared;
xsize.precision(8);
ysize.precision(8);
zsize.precision(8);
cutoffSquared.precision(8);
xsize << scientific << periodicBoxSize.x << "f";
ysize << scientific << periodicBoxSize.y << "f";
zsize << scientific << periodicBoxSize.z << "f";
cutoffSquared << scientific << (cutoff*cutoff) << "f";
defines["PERIODIC_BOX_SIZE_X"] = xsize.str();
defines["PERIODIC_BOX_SIZE_Y"] = ysize.str();
defines["PERIODIC_BOX_SIZE_Z"] = zsize.str();
defines["CUTOFF_SQUARED"] = cutoffSquared.str();
stringstream natom, padded;
natom << context.getNumAtoms();
padded << context.getPaddedNumAtoms();
defines["NUM_ATOMS"] = natom.str();
defines["PADDED_NUM_ATOMS"] = padded.str();
cl::Program program = context.createProgram(context.loadSourceFromFile("nonbonded.cl", replacements), defines);
cl::Kernel kernel(program, "computeNonbonded");
// Set arguments to the Kernel.
kernel.setArg<cl_int>(0, context.getNumAtoms());
kernel.setArg<cl_int>(1, context.getPaddedNumAtoms());
kernel.setArg<cl::Buffer>(2, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(3, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(4, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(5, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(6, exclusionIndex->getDeviceBuffer());
kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 11;
kernel.setArg<cl::Buffer>(0, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(1, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(3, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer());
kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 9;
if (useCutoff) {
paramBase = 15;
kernel.setArg<cl::Buffer>(9, interactingTiles->getDeviceBuffer());
kernel.setArg<cl_float>(10, cutoff*cutoff);
kernel.setArg<mm_float4>(11, periodicBoxSize);
kernel.setArg<cl::Buffer>(12, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(13, interactionCount->getDeviceBuffer());
kernel.setArg(14, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
paramBase = 11;
kernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(8, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(9, interactionCount->getDeviceBuffer());
kernel.setArg(10, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
}
else {
kernel.setArg<cl::Buffer>(9, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(10, tiles->getSize());
kernel.setArg<cl::Buffer>(7, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(8, tiles->getSize());
}
for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Buffer>(i*2+paramBase, params[i].getBuffer());
......
#ifdef USE_CUTOFF
if (!isExcluded && r2 < cutoffSquared) {
if (!isExcluded && r2 < CUTOFF_SQUARED) {
#else
if (!isExcluded) {
#endif
......
......@@ -7,10 +7,10 @@ const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.
* Compute the Born sum.
*/
__kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __global unsigned int* tiles,
#ifdef USE_CUTOFF
float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float* tempBuffer) {
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float* tempBuffer) {
#else
unsigned int numTiles) {
#endif
......@@ -42,19 +42,19 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
local_posq[get_local_id(0)] = posq1;
local_params[get_local_id(0)] = params1;
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < numAtoms && y+j < numAtoms && r2 < cutoffSquared) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < numAtoms && y+j < numAtoms) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
#endif
float r = sqrt(r2);
float invR = 1.0f/r;
......@@ -76,9 +76,9 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*paddedNumAtoms;
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*paddedNumAtoms;
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum;
}
......@@ -104,16 +104,16 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
if (atom1 < numAtoms && y+j < numAtoms && r2 < cutoffSquared) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < numAtoms && y+j < numAtoms) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
#endif
float r = sqrt(r2);
float invR = 1.0f/r;
......@@ -168,19 +168,19 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < numAtoms && y+tj < numAtoms && r2 < cutoffSquared) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < numAtoms && y+tj < numAtoms) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
#endif
float r = sqrt(r2);
float invR = 1.0f/r;
......@@ -218,11 +218,11 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*paddedNumAtoms;
unsigned int offset2 = y + tgx + (x/TileSize)*paddedNumAtoms;
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*paddedNumAtoms;
unsigned int offset2 = y + tgx + warp*paddedNumAtoms;
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum;
global_bornSum[offset2] += local_bornSum[get_local_id(0)];
......@@ -236,10 +236,10 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
* Reduce the Born sums to compute the Born radii.
*/
__kernel void reduceBornSum(int numAtoms, int bufferSize, int numBuffers, float alpha, float beta, float gamma,
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
__global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
unsigned int index = get_global_id(0);
while (index < numAtoms) {
while (index < NUM_ATOMS) {
// Get summed Born data
int totalSize = bufferSize*numBuffers;
......@@ -268,11 +268,11 @@ __kernel void reduceBornSum(int numAtoms, int bufferSize, int numBuffers, float
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefactor, __global float4* forceBuffers, __global float* energyBuffer,
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __global unsigned int* tiles,
#ifdef USE_CUTOFF
float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer) {
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer) {
#else
unsigned int numTiles) {
#endif
......@@ -304,14 +304,14 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
local_posq[get_local_id(0)] = posq1;
local_bornRadii[get_local_id(0)] = bornRadius1;
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
if (atom1 < numAtoms && y+j < numAtoms) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
......@@ -323,13 +323,13 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2);
float tempEnergy = (prefactor*posq1.w*posq2.w)/denominator;
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (r2 > cutoffSquared) {
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
}
......@@ -342,9 +342,9 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*paddedNumAtoms;
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*paddedNumAtoms;
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w;
......@@ -371,9 +371,9 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
......@@ -385,15 +385,15 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2);
float tempEnergy = (prefactor*posq1.w*posq2.w)/denominator;
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (atom1 >= numAtoms || y+j >= numAtoms || r2 > cutoffSquared) {
if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= numAtoms || y+j >= numAtoms) {
if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS) {
#endif
dEdR = 0.0f;
tempEnergy = 0.0f;
......@@ -426,14 +426,14 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
if (atom1 < numAtoms && y+tj < numAtoms) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
......@@ -445,13 +445,13 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2);
float tempEnergy = (prefactor*posq1.w*posq2.w)/denominator;
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (r2 > cutoffSquared) {
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
}
......@@ -467,11 +467,11 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*paddedNumAtoms;
unsigned int offset2 = y + tgx + (x/TileSize)*paddedNumAtoms;
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*paddedNumAtoms;
unsigned int offset2 = y + tgx + warp*paddedNumAtoms;
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
......@@ -488,11 +488,11 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
* Reduce the Born force.
*/
__kernel void reduceBornForce(int numAtoms, int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
__global float2* params, __global float* bornRadii, __global float* obcChain) {
float energy = 0.0f;
unsigned int index = get_global_id(0);
while (index < numAtoms) {
while (index < NUM_ATOMS) {
// Sum the Born force
int totalSize = bufferSize*numBuffers;
......
#ifdef USE_CUTOFF
if (atom1 < numAtoms && atom2 < numAtoms && atom1 != atom2 && r2 < cutoffSquared) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < numAtoms && atom2 < numAtoms && atom1 != atom2) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
float invRSquared = 1.0f/r2;
float rScaledRadiusJ = r+obcParams2.y;
......
......@@ -4,10 +4,10 @@ const unsigned int TileSize = 32;
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4* forceBuffers, __global float* energyBuffer, __global float4* posq,
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq,
__global unsigned int* exclusions, __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __global unsigned int* tiles,
#ifdef USE_CUTOFF
float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer
#else
unsigned int numTiles
#endif
......@@ -41,7 +41,7 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
#endif
......@@ -51,13 +51,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
#endif
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f / r;
float invR = 1.0f/r;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS
......@@ -73,9 +73,9 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*paddedNumAtoms;
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*paddedNumAtoms;
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz += force.xyz;
}
......@@ -102,13 +102,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
bool isExcluded = false;
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f / r;
float invR = 1.0f/r;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS
......@@ -144,7 +144,7 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx));
......@@ -155,13 +155,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
#endif
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f / r;
float invR = 1.0f/r;
int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS
......@@ -180,11 +180,11 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*paddedNumAtoms;
unsigned int offset2 = y + tgx + (x/TileSize)*paddedNumAtoms;
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*paddedNumAtoms;
unsigned int offset2 = y + tgx + warp*paddedNumAtoms;
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment