Commit d65992ea authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimized nonbonded kernels by inlining lots of constant quantities

parent d3ec1d0c
...@@ -542,11 +542,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -542,11 +542,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
if (useCutoff) { if (useCutoff) {
double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0); double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0); double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
char k[50], c[50]; stringstream k, c;
sprintf(k, "%.8ef", reactionFieldK); k.precision(8);
sprintf(c, "%.8ef", reactionFieldC); c.precision(8);
defines["REACTION_FIELD_K"] = string(k); k << scientific << reactionFieldK << "f";
defines["REACTION_FIELD_C"] = string(c); c << scientific << reactionFieldC << "f";
defines["REACTION_FIELD_K"] = k.str();
defines["REACTION_FIELD_C"] = c.str();
} }
// if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) { // if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
// method = CUTOFF; // method = CUTOFF;
...@@ -838,73 +840,83 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) { ...@@ -838,73 +840,83 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines["USE_CUTOFF"] = "1"; defines["USE_CUTOFF"] = "1";
if (nb.getUsePeriodic()) if (nb.getUsePeriodic())
defines["USE_PERIODIC"] = "1"; defines["USE_PERIODIC"] = "1";
stringstream xsize, ysize, zsize, cutoffSquared, prefac;
xsize.precision(8);
ysize.precision(8);
zsize.precision(8);
cutoffSquared.precision(8);
prefac.precision(8);
xsize << scientific << nb.getPeriodicBoxSize().x << "f";
ysize << scientific << nb.getPeriodicBoxSize().y << "f";
zsize << scientific << nb.getPeriodicBoxSize().z << "f";
cutoffSquared << scientific << (nb.getCutoffDistance()*nb.getCutoffDistance()) << "f";
prefac << scientific << prefactor << "f";
defines["PERIODIC_BOX_SIZE_X"] = xsize.str();
defines["PERIODIC_BOX_SIZE_Y"] = ysize.str();
defines["PERIODIC_BOX_SIZE_Z"] = zsize.str();
defines["CUTOFF_SQUARED"] = cutoffSquared.str();
defines["PREFACTOR"] = prefac.str();
stringstream natom, padded;
natom << cl.getNumAtoms();
padded << cl.getPaddedNumAtoms();
defines["NUM_ATOMS"] = natom.str();
defines["PADDED_NUM_ATOMS"] = padded.str();
cl::Program program = cl.createProgram(cl.loadSourceFromFile("gbsaObc.cl"), defines); cl::Program program = cl.createProgram(cl.loadSourceFromFile("gbsaObc.cl"), defines);
computeBornSumKernel = cl::Kernel(program, "computeBornSum"); computeBornSumKernel = cl::Kernel(program, "computeBornSum");
computeBornSumKernel.setArg<cl_int>(0, cl.getNumAtoms()); computeBornSumKernel.setArg<cl::Buffer>(0, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg<cl_int>(1, cl.getPaddedNumAtoms()); computeBornSumKernel.setArg(1, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(2, bornSum->getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
computeBornSumKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
computeBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
computeBornSumKernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_float>(9, nb.getCutoffDistance()*nb.getCutoffDistance()); computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<mm_float4>(10, nb.getPeriodicBoxSize()); computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionCount().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer()); computeBornSumKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(12, nb.getInteractionCount().getDeviceBuffer());
computeBornSumKernel.setArg(13, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
} }
else { else {
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(6, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(9, nb.getTiles().getSize()); computeBornSumKernel.setArg<cl_uint>(7, nb.getTiles().getSize());
} }
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum"); reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getNumAtoms()); reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornSumKernel.setArg<cl_int>(1, cl.getPaddedNumAtoms()); reduceBornSumKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
reduceBornSumKernel.setArg<cl_int>(2, cl.getNumForceBuffers()); reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
reduceBornSumKernel.setArg<cl_float>(3, 1.0f); reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
reduceBornSumKernel.setArg<cl_float>(4, 0.8f); reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
reduceBornSumKernel.setArg<cl_float>(5, 4.85f); reduceBornSumKernel.setArg<cl::Buffer>(5, bornSum->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(6, bornSum->getDeviceBuffer()); reduceBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(7, params->getDeviceBuffer()); reduceBornSumKernel.setArg<cl::Buffer>(7, bornRadii->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(8, bornRadii->getDeviceBuffer()); reduceBornSumKernel.setArg<cl::Buffer>(8, obcChain->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(9, obcChain->getDeviceBuffer());
force1Kernel = cl::Kernel(program, "computeGBSAForce1"); force1Kernel = cl::Kernel(program, "computeGBSAForce1");
force1Kernel.setArg<cl_int>(0, cl.getNumAtoms()); force1Kernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl_int>(1, cl.getPaddedNumAtoms()); force1Kernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl_float>(2, prefactor); force1Kernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(3, cl.getForceBuffers().getDeviceBuffer()); force1Kernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg<cl::Buffer>(4, cl.getEnergyBuffer().getDeviceBuffer()); force1Kernel.setArg(4, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
force1Kernel.setArg<cl::Buffer>(5, cl.getPosq().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); force1Kernel.setArg<cl::Buffer>(7, bornForce->getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(8, bornRadii->getDeviceBuffer()); force1Kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(10, bornForce->getDeviceBuffer());
force1Kernel.setArg(11, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(12, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(9, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_float>(13, nb.getCutoffDistance()*nb.getCutoffDistance()); force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<mm_float4>(14, nb.getPeriodicBoxSize()); force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionCount().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(15, nb.getInteractionFlags().getDeviceBuffer()); force1Kernel.setArg(12, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
force1Kernel.setArg<cl::Buffer>(16, nb.getInteractionCount().getDeviceBuffer());
force1Kernel.setArg(17, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
} }
else { else {
force1Kernel.setArg<cl::Buffer>(12, nb.getTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(9, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(13, nb.getTiles().getSize()); force1Kernel.setArg<cl_uint>(10, nb.getTiles().getSize());
} }
reduceBornForceKernel = cl::Kernel(program, "reduceBornForce"); reduceBornForceKernel = cl::Kernel(program, "reduceBornForce");
reduceBornForceKernel.setArg<cl_int>(0, cl.getNumAtoms()); reduceBornForceKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornForceKernel.setArg<cl_int>(1, cl.getPaddedNumAtoms()); reduceBornForceKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
reduceBornForceKernel.setArg<cl_int>(2, cl.getNumForceBuffers()); reduceBornForceKernel.setArg<cl::Buffer>(2, bornForce->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(3, bornForce->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(3, cl.getEnergyBuffer().getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(4, cl.getEnergyBuffer().getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(5, params->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(5, bornRadii->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(6, bornRadii->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(7, obcChain->getDeviceBuffer());
} }
cl.clearBuffer(*bornSum); cl.clearBuffer(*bornSum);
cl.clearBuffer(*bornForce); cl.clearBuffer(*bornForce);
......
...@@ -327,33 +327,47 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -327,33 +327,47 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_PERIODIC"] = "1"; defines["USE_PERIODIC"] = "1";
if (useExclusions) if (useExclusions)
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
stringstream xsize, ysize, zsize, cutoffSquared;
xsize.precision(8);
ysize.precision(8);
zsize.precision(8);
cutoffSquared.precision(8);
xsize << scientific << periodicBoxSize.x << "f";
ysize << scientific << periodicBoxSize.y << "f";
zsize << scientific << periodicBoxSize.z << "f";
cutoffSquared << scientific << (cutoff*cutoff) << "f";
defines["PERIODIC_BOX_SIZE_X"] = xsize.str();
defines["PERIODIC_BOX_SIZE_Y"] = ysize.str();
defines["PERIODIC_BOX_SIZE_Z"] = zsize.str();
defines["CUTOFF_SQUARED"] = cutoffSquared.str();
stringstream natom, padded;
natom << context.getNumAtoms();
padded << context.getPaddedNumAtoms();
defines["NUM_ATOMS"] = natom.str();
defines["PADDED_NUM_ATOMS"] = padded.str();
cl::Program program = context.createProgram(context.loadSourceFromFile("nonbonded.cl", replacements), defines); cl::Program program = context.createProgram(context.loadSourceFromFile("nonbonded.cl", replacements), defines);
cl::Kernel kernel(program, "computeNonbonded"); cl::Kernel kernel(program, "computeNonbonded");
// Set arguments to the Kernel. // Set arguments to the Kernel.
kernel.setArg<cl_int>(0, context.getNumAtoms()); kernel.setArg<cl::Buffer>(0, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl_int>(1, context.getPaddedNumAtoms()); kernel.setArg<cl::Buffer>(1, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(2, context.getForceBuffers().getDeviceBuffer()); kernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(3, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg<cl::Buffer>(3, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(4, context.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer());
kernel.setArg<cl::Buffer>(5, exclusions->getDeviceBuffer()); kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg<cl::Buffer>(6, exclusionIndex->getDeviceBuffer()); kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); int paramBase = 9;
kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 11;
if (useCutoff) { if (useCutoff) {
paramBase = 15; paramBase = 11;
kernel.setArg<cl::Buffer>(9, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
kernel.setArg<cl_float>(10, cutoff*cutoff); kernel.setArg<cl::Buffer>(8, interactionFlags->getDeviceBuffer());
kernel.setArg<mm_float4>(11, periodicBoxSize); kernel.setArg<cl::Buffer>(9, interactionCount->getDeviceBuffer());
kernel.setArg<cl::Buffer>(12, interactionFlags->getDeviceBuffer()); kernel.setArg(10, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg<cl::Buffer>(13, interactionCount->getDeviceBuffer());
kernel.setArg(14, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
} }
else { else {
kernel.setArg<cl::Buffer>(9, tiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(7, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(10, tiles->getSize()); kernel.setArg<cl_uint>(8, tiles->getSize());
} }
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Buffer>(i*2+paramBase, params[i].getBuffer()); kernel.setArg<cl::Buffer>(i*2+paramBase, params[i].getBuffer());
......
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (!isExcluded && r2 < cutoffSquared) { if (!isExcluded && r2 < CUTOFF_SQUARED) {
#else #else
if (!isExcluded) { if (!isExcluded) {
#endif #endif
......
...@@ -7,10 +7,10 @@ const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0. ...@@ -7,10 +7,10 @@ const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* global_bornSum, __local float* local_bornSum, __global float4* posq, __kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __global unsigned int* tiles, __local float4* local_posq, __global float2* global_params, __local float2* local_params, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float* tempBuffer) { __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float* tempBuffer) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -42,19 +42,19 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g ...@@ -42,19 +42,19 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
local_params[get_local_id(0)] = params1; local_params[get_local_id(0)] = params1;
unsigned int xi = x/TileSize; unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) { for (unsigned int j = 0; j < TileSize; j++) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < numAtoms && y+j < numAtoms && r2 < cutoffSquared) { if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < numAtoms && y+j < numAtoms) { if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
#endif #endif
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = 1.0f/r;
...@@ -76,9 +76,9 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g ...@@ -76,9 +76,9 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*paddedNumAtoms; unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*paddedNumAtoms; unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
} }
...@@ -104,16 +104,16 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g ...@@ -104,16 +104,16 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f; tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < numAtoms && y+j < numAtoms && r2 < cutoffSquared) { if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < numAtoms && y+j < numAtoms) { if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
#endif #endif
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = 1.0f/r;
...@@ -168,19 +168,19 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g ...@@ -168,19 +168,19 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
unsigned int xi = x/TileSize; unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize; unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2; unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) { for (unsigned int j = 0; j < TileSize; j++) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < numAtoms && y+tj < numAtoms && r2 < cutoffSquared) { if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < numAtoms && y+tj < numAtoms) { if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
#endif #endif
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = 1.0f/r;
...@@ -218,11 +218,11 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g ...@@ -218,11 +218,11 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
// Write results // Write results
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*paddedNumAtoms; unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*paddedNumAtoms; unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*paddedNumAtoms; unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*paddedNumAtoms; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
global_bornSum[offset2] += local_bornSum[get_local_id(0)]; global_bornSum[offset2] += local_bornSum[get_local_id(0)];
...@@ -236,10 +236,10 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g ...@@ -236,10 +236,10 @@ __kernel void computeBornSum(int numAtoms, int paddedNumAtoms, __global float* g
* Reduce the Born sums to compute the Born radii. * Reduce the Born sums to compute the Born radii.
*/ */
__kernel void reduceBornSum(int numAtoms, int bufferSize, int numBuffers, float alpha, float beta, float gamma, __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
__global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) { __global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < numAtoms) { while (index < NUM_ATOMS) {
// Get summed Born data // Get summed Born data
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
...@@ -268,11 +268,11 @@ __kernel void reduceBornSum(int numAtoms, int bufferSize, int numBuffers, float ...@@ -268,11 +268,11 @@ __kernel void reduceBornSum(int numAtoms, int bufferSize, int numBuffers, float
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefactor, __global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii, __global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __global unsigned int* tiles, __global float* global_bornForce, __local float* local_bornForce, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer) { __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -304,14 +304,14 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -304,14 +304,14 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
local_bornRadii[get_local_id(0)] = bornRadius1; local_bornRadii[get_local_id(0)] = bornRadius1;
unsigned int xi = x/TileSize; unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) { for (unsigned int j = 0; j < TileSize; j++) {
if (atom1 < numAtoms && y+j < numAtoms) { if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
...@@ -323,13 +323,13 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -323,13 +323,13 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2); float denominator = sqrt(denominator2);
float tempEnergy = (prefactor*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2; float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm); float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cutoffSquared) { if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f; dEdR = 0.0f;
tempEnergy = 0.0f; tempEnergy = 0.0f;
} }
...@@ -342,9 +342,9 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -342,9 +342,9 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*paddedNumAtoms; unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*paddedNumAtoms; unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
...@@ -371,9 +371,9 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -371,9 +371,9 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
...@@ -385,15 +385,15 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -385,15 +385,15 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2); float denominator = sqrt(denominator2);
float tempEnergy = (prefactor*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2; float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm); float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 >= numAtoms || y+j >= numAtoms || r2 > cutoffSquared) { if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else #else
if (atom1 >= numAtoms || y+j >= numAtoms) { if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS) {
#endif #endif
dEdR = 0.0f; dEdR = 0.0f;
tempEnergy = 0.0f; tempEnergy = 0.0f;
...@@ -426,14 +426,14 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -426,14 +426,14 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
unsigned int xi = x/TileSize; unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize; unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2; unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) { for (unsigned int j = 0; j < TileSize; j++) {
if (atom1 < numAtoms && y+tj < numAtoms) { if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
...@@ -445,13 +445,13 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -445,13 +445,13 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
float expTerm = exp(-D_ij); float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2); float denominator = sqrt(denominator2);
float tempEnergy = (prefactor*posq1.w*posq2.w)/denominator; float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2; float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm); float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > cutoffSquared) { if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f; dEdR = 0.0f;
tempEnergy = 0.0f; tempEnergy = 0.0f;
} }
...@@ -467,11 +467,11 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -467,11 +467,11 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*paddedNumAtoms; unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*paddedNumAtoms; unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*paddedNumAtoms; unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*paddedNumAtoms; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
...@@ -488,11 +488,11 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto ...@@ -488,11 +488,11 @@ __kernel void computeGBSAForce1(int numAtoms, int paddedNumAtoms, float prefacto
* Reduce the Born force. * Reduce the Born force.
*/ */
__kernel void reduceBornForce(int numAtoms, int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer, __kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
__global float2* params, __global float* bornRadii, __global float* obcChain) { __global float2* params, __global float* bornRadii, __global float* obcChain) {
float energy = 0.0f; float energy = 0.0f;
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < numAtoms) { while (index < NUM_ATOMS) {
// Sum the Born force // Sum the Born force
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
......
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < numAtoms && atom2 < numAtoms && atom1 != atom2 && r2 < cutoffSquared) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < numAtoms && atom2 < numAtoms && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif #endif
float invRSquared = 1.0f/r2; float invRSquared = 1.0f/r2;
float rScaledRadiusJ = r+obcParams2.y; float rScaledRadiusJ = r+obcParams2.y;
......
...@@ -4,10 +4,10 @@ const unsigned int TileSize = 32; ...@@ -4,10 +4,10 @@ const unsigned int TileSize = 32;
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq,
__global unsigned int* exclusions, __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __global unsigned int* tiles, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -41,7 +41,7 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -41,7 +41,7 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TileSize; unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*paddedNumAtoms/TileSize-xi*(xi+1)/2; unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx]; unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
#endif #endif
...@@ -51,13 +51,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -51,13 +51,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
#endif #endif
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f / r; float invR = 1.0f/r;
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
...@@ -73,9 +73,9 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -73,9 +73,9 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*paddedNumAtoms; unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*paddedNumAtoms; unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
} }
...@@ -102,13 +102,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -102,13 +102,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
bool isExcluded = false; bool isExcluded = false;
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f / r; float invR = 1.0f/r;
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
...@@ -144,7 +144,7 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -144,7 +144,7 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
unsigned int xi = x/TileSize; unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize; unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2; unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx)); excl = (excl >> tgx) | (excl << (TileSize - tgx));
...@@ -155,13 +155,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -155,13 +155,13 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
#endif #endif
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f); float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f / r; float invR = 1.0f/r;
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
...@@ -180,11 +180,11 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4 ...@@ -180,11 +180,11 @@ __kernel void computeNonbonded(int numAtoms, int paddedNumAtoms, __global float4
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*paddedNumAtoms; unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*paddedNumAtoms; unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*paddedNumAtoms; unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*paddedNumAtoms; unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment