Commit 95e059dc authored by Peter Eastman's avatar Peter Eastman
Browse files

Created OpenCL implementation of GBSAOBCForce that works with non-Nvidia processors

parent f0cab877
......@@ -853,28 +853,15 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines["USE_CUTOFF"] = "1";
if (nb.getUsePeriodic())
defines["USE_PERIODIC"] = "1";
stringstream xsize, ysize, zsize, cutoffSquared, prefac;
xsize.precision(8);
ysize.precision(8);
zsize.precision(8);
cutoffSquared.precision(8);
prefac.precision(8);
xsize << scientific << nb.getPeriodicBoxSize().x << "f";
ysize << scientific << nb.getPeriodicBoxSize().y << "f";
zsize << scientific << nb.getPeriodicBoxSize().z << "f";
cutoffSquared << scientific << (nb.getCutoffDistance()*nb.getCutoffDistance()) << "f";
prefac << scientific << prefactor << "f";
defines["PERIODIC_BOX_SIZE_X"] = xsize.str();
defines["PERIODIC_BOX_SIZE_Y"] = ysize.str();
defines["PERIODIC_BOX_SIZE_Z"] = zsize.str();
defines["CUTOFF_SQUARED"] = cutoffSquared.str();
defines["PREFACTOR"] = prefac.str();
stringstream natom, padded;
natom << cl.getNumAtoms();
padded << cl.getPaddedNumAtoms();
defines["NUM_ATOMS"] = natom.str();
defines["PADDED_NUM_ATOMS"] = padded.str();
cl::Program program = cl.createProgram(cl.loadSourceFromFile("gbsaObc.cl"), defines);
defines["PERIODIC_BOX_SIZE_X"] = doubleToString(nb.getPeriodicBoxSize().x);
defines["PERIODIC_BOX_SIZE_Y"] = doubleToString(nb.getPeriodicBoxSize().y);
defines["PERIODIC_BOX_SIZE_Z"] = doubleToString(nb.getPeriodicBoxSize().z);
defines["CUTOFF_SQUARED"] = doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
defines["PREFACTOR"] = doubleToString(prefactor);
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
string filename = (cl.getSIMDWidth() == 32 ? "gbsaObc_nvidia.cl" : "gbsaObc_default.cl");
cl::Program program = cl.createProgram(cl.loadSourceFromFile(filename), defines);
computeBornSumKernel = cl::Kernel(program, "computeBornSum");
computeBornSumKernel.setArg<cl::Buffer>(0, bornSum->getDeviceBuffer());
computeBornSumKernel.setArg(1, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
......@@ -882,26 +869,16 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
computeBornSumKernel.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
computeBornSumKernel.setArg<cl::Buffer>(4, params->getDeviceBuffer());
computeBornSumKernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float2), NULL);
computeBornSumKernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionCount().getDeviceBuffer());
computeBornSumKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractionCount().getDeviceBuffer());
}
else {
computeBornSumKernel.setArg<cl::Buffer>(6, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(7, nb.getTiles().getSize());
computeBornSumKernel.setArg<cl::Buffer>(7, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(8, nb.getTiles().getSize());
}
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornSumKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
reduceBornSumKernel.setArg<cl::Buffer>(5, bornSum->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(7, bornRadii->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(8, obcChain->getDeviceBuffer());
force1Kernel = cl::Kernel(program, "computeGBSAForce1");
force1Kernel.setArg<cl::Buffer>(0, cl.getForceBuffers().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
......@@ -912,16 +889,27 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
force1Kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg<cl::Buffer>(7, bornForce->getDeviceBuffer());
force1Kernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
force1Kernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(9, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionCount().getDeviceBuffer());
force1Kernel.setArg(12, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(12, nb.getInteractionCount().getDeviceBuffer());
}
else {
force1Kernel.setArg<cl::Buffer>(9, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(10, nb.getTiles().getSize());
force1Kernel.setArg<cl::Buffer>(10, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(11, nb.getTiles().getSize());
}
program = cl.createProgram(cl.loadSourceFromFile("gbsaObcReductions.cl"), defines);
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornSumKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
reduceBornSumKernel.setArg<cl::Buffer>(5, bornSum->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(6, params->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(7, bornRadii->getDeviceBuffer());
reduceBornSumKernel.setArg<cl::Buffer>(8, obcChain->getDeviceBuffer());
reduceBornForceKernel = cl::Kernel(program, "reduceBornForce");
reduceBornForceKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
reduceBornForceKernel.setArg<cl_int>(1, cl.getNumForceBuffers());
......
......@@ -300,7 +300,7 @@ private:
class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform), cl(cl),
exceptionParams(NULL), exceptionIndices(NULL) {
sigmaEpsilon(NULL), exceptionParams(NULL), exceptionIndices(NULL), cosSinSums(NULL) {
}
~OpenCLCalcNonbondedForceKernel();
/**
......
......@@ -107,6 +107,14 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if (cutoff == -1.0)
return; // There are no nonbonded interactions in the System.
if (atomExclusions.size() == 0) {
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions.resize(context.getNumAtoms());
for (int i = 0; i < atomExclusions.size(); i++)
atomExclusions[i].push_back(i);
}
// Create the list of tiles.
int numAtomBlocks = context.getNumAtomBlocks();
......
const float dielectricOffset = 0.009f;
const float probeRadius = 0.14f;
const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
/**
* Reduce the Born sums to compute the Born radii.
*/
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
__global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Get summed Born data
int totalSize = bufferSize*numBuffers;
float sum = bornSum[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
// Now calculate Born radius and OBC term.
float offsetRadius = params[index].x;
sum *= 0.5f*offsetRadius;
float sum2 = sum*sum;
float sum3 = sum*sum2;
float tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
float nonOffsetRadius = offsetRadius + dielectricOffset;
float radius = 1.0f/(1.0f/offsetRadius - tanhSum/nonOffsetRadius);
float chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2);
chain = (1.0f-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius;
obcChain[index] = chain;
index += get_global_size(0);
}
}
/**
* Reduce the Born force.
*/
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
__global float2* params, __global float* bornRadii, __global float* obcChain) {
float energy = 0.0f;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Sum the Born force
int totalSize = bufferSize*numBuffers;
float force = bornForce[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
// Now calculate the actual force
float offsetRadius = params[index].x;
float bornRadius = bornRadii[index];
float r = offsetRadius+dielectricOffset+probeRadius;
float ratio6 = pow((offsetRadius+dielectricOffset)/bornRadius, 6.0f);
float saTerm = surfaceAreaFactor*r*r*ratio6;
force += saTerm/bornRadius;
energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index];
bornForce[index] = force;
index += get_global_size(0);
}
energyBuffer[get_global_id(0)] += energy/-6.0f;
}
\ No newline at end of file
#define TILE_SIZE 32
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
local_params[get_local_id(0)] = params1;
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (local_posq[baseLocalAtom+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
#endif
float r = sqrt(r2);
float invR = 1.0f/r;
float2 params2 = local_params[baseLocalAtom+j];
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
}
}
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
}
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
local_params[get_local_id(0)] = global_params[j];
}
local_bornSum[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (local_posq[baseLocalAtom+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
#endif
float r = sqrt(r2);
float invR = 1.0f/r;
float2 params2 = local_params[baseLocalAtom+tj];
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params1.x, fabs(r-params2.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(1.0f/params1.x-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusJ) {
float l_ij = 1.0f/max(params2.x, fabs(r-params1.y));
float u_ij = 1.0f/rScaledRadiusJ;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = log(u_ij / l_ij);
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(1.0f/params2.x-l_ij);
local_bornSum[baseLocalAtom+tj+forceBufferOffset] += term;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += local_bornSum[get_local_id(0)]+local_bornSum[get_local_id(0)+TILE_SIZE];
}
lasty = y;
}
pos++;
}
}
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
local_bornRadii[get_local_id(0)] = bornRadius1;
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
if (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS) {
float4 posq2 = local_posq[baseLocalAtom+j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
float bornRadius2 = local_bornRadii[baseLocalAtom+j];
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
}
#endif
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
}
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = force;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
}
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
local_bornRadii[get_local_id(0)] = global_bornRadii[j];
}
local_force[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
if (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS) {
float4 posq2 = local_posq[baseLocalAtom+tj];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
float bornRadius2 = local_bornRadii[baseLocalAtom+tj];
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = sqrt(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)/denominator;
float Gpol = tempEnergy/denominator2;
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
}
#endif
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
}
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)%(TILE_SIZE/2);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = force;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz = forceBuffers[offset2].xyz+local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
global_bornForce[offset1] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
global_bornForce[offset2] += local_force[get_local_id(0)].w+local_force[get_local_id(0)+TILE_SIZE].w;
}
lasty = y;
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
}
const unsigned int TileSize = 32;
const float dielectricOffset = 0.009f;
const float probeRadius = 0.14f;
const float surfaceAreaFactor = -6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define TILE_SIZE 32
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __local float* local_bornSum, __global float4* posq,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __global unsigned int* tiles,
__local float4* local_posq, __global float2* global_params, __local float2* local_params, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float* tempBuffer) {
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int totalWarps = get_global_size(0)/TileSize;
unsigned int warp = get_global_id(0)/TileSize;
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f;
......@@ -27,11 +24,10 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
x = (x>>17)*TileSize;
unsigned int tgx = get_local_id(0) & (TileSize-1);
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int tj = tgx;
unsigned int atom1 = x + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
......@@ -41,9 +37,9 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
local_posq[get_local_id(0)] = posq1;
local_params[get_local_id(0)] = params1;
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -76,7 +72,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif
......@@ -100,7 +96,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TileSize; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -166,10 +162,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
......@@ -211,15 +208,15 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
local_bornSum[tbx+tj] += term;
}
}
tj = (tj + 1) & (TileSize - 1);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
......@@ -232,55 +229,23 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
}
}
/**
* Reduce the Born sums to compute the Born radii.
*/
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
__global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Get summed Born data
int totalSize = bufferSize*numBuffers;
float sum = bornSum[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
// Now calculate Born radius and OBC term.
float offsetRadius = params[index].x;
sum *= 0.5f*offsetRadius;
float sum2 = sum*sum;
float sum3 = sum*sum2;
float tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
float nonOffsetRadius = offsetRadius + dielectricOffset;
float radius = 1.0f/(1.0f/offsetRadius - tanhSum/nonOffsetRadius);
float chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2);
chain = (1.0f-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius;
obcChain[index] = chain;
index += get_global_size(0);
}
}
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __local float4* local_posq, __local float4* local_force, __global float* global_bornRadii, __local float* local_bornRadii,
__global float* global_bornForce, __local float* local_bornForce, __global unsigned int* tiles,
__global float* global_bornForce, __local float* local_bornForce, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer) {
__global unsigned int* interactionFlags, __global unsigned int* interactionCount) {
#else
unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int totalWarps = get_global_size(0)/TileSize;
unsigned int warp = get_global_id(0)/TileSize;
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f;
......@@ -289,11 +254,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
x = (x>>17)*TileSize;
unsigned int tgx = get_local_id(0) & (TileSize-1);
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int tj = tgx;
unsigned int atom1 = x + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
......@@ -303,11 +267,12 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
local_posq[get_local_id(0)] = posq1;
local_bornRadii[get_local_id(0)] = bornRadius1;
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
float4 posq2 = local_posq[tbx+j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -316,7 +281,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
float4 posq2 = local_posq[tbx+j];
float bornRadius2 = local_bornRadii[tbx+j];
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
......@@ -342,7 +306,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif
......@@ -367,9 +331,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TileSize; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
float4 posq2 = local_posq[tbx+j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -377,8 +342,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f / r;
float4 posq2 = local_posq[tbx+j];
float invR = 1.0f/r;
float bornRadius2 = local_bornRadii[tbx+j];
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
......@@ -424,12 +388,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
for (unsigned int j = 0; j < TileSize; j++) {
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
float4 posq2 = local_posq[tbx+tj];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
......@@ -437,8 +403,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f / r;
float4 posq2 = local_posq[tbx+tj];
float invR = 1.0f/r;
float bornRadius2 = local_bornRadii[tbx+tj];
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2/(4.0f*alpha2_ij);
......@@ -461,14 +426,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
force.xyz -= delta.xyz;
local_force[tbx+tj] += (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
}
tj = (tj + 1) & (TileSize - 1);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
......@@ -483,35 +448,3 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
}
energyBuffer[get_global_id(0)] += energy;
}
/**
* Reduce the Born force.
*/
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
__global float2* params, __global float* bornRadii, __global float* obcChain) {
float energy = 0.0f;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Sum the Born force
int totalSize = bufferSize*numBuffers;
float force = bornForce[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
// Now calculate the actual force
float offsetRadius = params[index].x;
float bornRadius = bornRadii[index];
float r = offsetRadius+dielectricOffset+probeRadius;
float ratio6 = pow((offsetRadius+dielectricOffset)/bornRadius, 6.0f);
float saTerm = surfaceAreaFactor*r*r*ratio6;
force += saTerm/bornRadius;
energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index];
bornForce[index] = force;
index += get_global_size(0);
}
energyBuffer[get_global_id(0)] += energy/-6.0f;
}
const unsigned int TileSize = 32;
#define TILE_SIZE 32
/**
* Compute nonbonded interactions.
......@@ -15,8 +15,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int totalWarps = get_global_size(0)/TileSize;
unsigned int warp = get_global_id(0)/TileSize;
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f;
......@@ -25,10 +25,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TileSize;
unsigned int tgx = get_local_id(0) & (TileSize-1);
x = (x>>17)*TILE_SIZE;
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx;
float4 force = 0.0f;
......@@ -39,12 +39,12 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
#endif
for (unsigned int j = 0; j < TileSize; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
......@@ -72,7 +72,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
#endif
......@@ -96,7 +96,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TileSize; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
......@@ -141,15 +141,15 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx));
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TileSize; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
......@@ -174,14 +174,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
force.xyz -= delta.xyz;
local_force[tbx+tj].xyz += delta.xyz;
excl >>= 1;
tj = (tj + 1) & (TileSize - 1);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment