Commit 9a4e2b02 authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginning of changes to reduce memory use for large systems

parent 2bdb84bd
......@@ -1690,6 +1690,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["PREFACTOR"] = doubleToString(prefactor);
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default);
cl::Program program = cl.createProgram(file, defines);
int index = 0;
......@@ -1985,6 +1986,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines["CUTOFF_SQUARED"] = doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBValueN2_nvidia : OpenCLKernelSources::customGBValueN2_default);
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairValueKernel = cl::Kernel(program, "computeN2Value");
......@@ -2131,6 +2133,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines["CUTOFF_SQUARED"] = doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBEnergyN2_nvidia : OpenCLKernelSources::customGBEnergyN2_default);
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
......@@ -2393,6 +2396,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
......@@ -2442,6 +2446,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
......@@ -2520,10 +2525,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues);
}
if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
}
cl.executeKernel(pairValueKernel, nb.getTiles().getSize()*OpenCLContext::TileSize);
cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms());
......
......@@ -30,12 +30,14 @@
#include "OpenCLKernelSources.h"
#include "OpenCLExpressionUtilities.h"
#include <map>
#include <set>
#include <utility>
using namespace OpenMM;
using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
numForceBuffers(0), tiles(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) {
// Decide how many force buffers to use.
......@@ -52,8 +54,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
if (tiles != NULL)
delete tiles;
if (exclusionIndex != NULL)
delete exclusionIndex;
if (exclusionIndices != NULL)
delete exclusionIndices;
if (exclusionRowIndices != NULL)
delete exclusionRowIndices;
if (exclusions != NULL)
delete exclusions;
if (interactingTiles != NULL)
......@@ -153,16 +157,36 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Build a list of indices for the tiles with exclusions.
exclusionIndex = new OpenCLArray<cl_uint>(context, numTiles, "exclusionIndex");
vector<cl_uint> exclusionIndexVec(exclusionIndex->getSize());
int numWithExclusions = 0;
for (int i = 0; i < numTiles; ++i)
if ((tileVec[i]&1) == 1)
exclusionIndexVec[i] = (numWithExclusions++)*OpenCLContext::TileSize;
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/OpenCLContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/OpenCLContext::TileSize;
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
for (int i = 0; i < numAtomBlocks; ++i)
tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
}
vector<cl_uint> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<cl_uint> exclusionIndicesVec;
int currentRow = 0;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
while (iter->first != currentRow)
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndicesVec.push_back(iter->second);
}
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndices = new OpenCLArray<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = new OpenCLArray<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec);
exclusionRowIndices->upload(exclusionRowIndicesVec);
// Record the exclusion data.
exclusions = new OpenCLArray<cl_uint>(context, numWithExclusions*OpenCLContext::TileSize, "exclusions");
exclusions = new OpenCLArray<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
vector<cl_uint> exclusionVec(exclusions->getSize());
for (int i = 0; i < exclusions->getSize(); ++i)
exclusionVec[i] = 0xFFFFFFFF;
......@@ -174,12 +198,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int y = atom2/OpenCLContext::TileSize;
int offset2 = atom2-y*OpenCLContext::TileSize;
if (x > y) {
int tile = x+y*numAtomBlocks-y*(y+1)/2;
exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<<offset2);
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
else {
int tile = y+x*numAtomBlocks-x*(x+1)/2;
exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<<offset1);
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
......@@ -193,19 +217,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int y = atom2/OpenCLContext::TileSize;
int offset2 = atom2-y*OpenCLContext::TileSize;
if (x >= y) {
int tile = x+y*numAtomBlocks-y*(y+1)/2;
exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<<offset2);
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
if (y >= x) {
int tile = y+x*numAtomBlocks-x*(x+1)/2;
exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<<offset1);
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
atomExclusions.clear(); // We won't use this again, so free the memory it used
tiles->upload(tileVec);
exclusions->upload(exclusionVec);
exclusionIndex->upload(exclusionIndexVec);
// Create data structures for the neighbor list.
......@@ -252,6 +275,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
}
}
int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_uint>& exclusionIndices, const vector<cl_uint>& exclusionRowIndices) {
int start = exclusionRowIndices[x];
int end = exclusionRowIndices[x+1];
for (int i = start; i < end; i++)
if (exclusionIndices[i] == y)
return i*OpenCLContext::TileSize;
throw OpenMMException("Internal error: exclusion in unexpected tile");
}
void OpenCLNonbondedUtilities::prepareInteractions() {
if (!useCutoff)
return;
......@@ -275,8 +307,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() {
if (tiles != NULL) {
if (useCutoff) {
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
}
context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize);
}
......@@ -388,6 +420,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
string file = (context.getSIMDWidth() == 32 ? OpenCLKernelSources::nonbonded_nvidia : OpenCLKernelSources::nonbonded_default);
cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
cl::Kernel kernel(program, "computeNonbonded");
......@@ -399,7 +432,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndex->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*localDataSize, NULL);
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (useCutoff) {
......
......@@ -175,7 +175,13 @@ public:
* Get the array containing the index into the exclusion array for each tile.
*/
OpenCLArray<cl_uint>& getExclusionIndices() {
return *exclusionIndex;
return *exclusionIndices;
}
/**
* Get the array listing where the exclusion data starts for each row.
*/
OpenCLArray<cl_uint>& getExclusionRowIndices() {
return *exclusionRowIndices;
}
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
......@@ -190,14 +196,16 @@ public:
*/
cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
private:
static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices);
OpenCLContext& context;
cl::Kernel forceKernel;
cl::Kernel findBlockBoundsKernel;
cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel;
OpenCLArray<cl_uint>* tiles;
OpenCLArray<cl_uint>* exclusionIndex;
OpenCLArray<cl_uint>* exclusions;
OpenCLArray<cl_uint>* exclusionIndices;
OpenCLArray<cl_uint>* exclusionRowIndices;
OpenCLArray<cl_uint>* interactingTiles;
OpenCLArray<cl_uint>* interactionFlags;
OpenCLArray<cl_uint>* interactionCount;
......
......@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__local float4* tempForceBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
......@@ -23,31 +23,54 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
DECLARE_TEMP_BUFFERS
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom;
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
......@@ -67,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j;
atom2 = y*TILE_SIZE+baseLocalAtom+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
......@@ -94,9 +117,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
STORE_DERIVATIVES_1
......@@ -106,7 +129,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
......@@ -116,11 +139,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
......@@ -144,7 +164,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj;
atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
......@@ -176,11 +196,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
......
......@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
......@@ -25,28 +25,52 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4];
__local int exclusionIndex[2];
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
......@@ -66,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+j;
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
......@@ -86,9 +110,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
......@@ -97,7 +121,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
......@@ -113,11 +137,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
......@@ -139,7 +160,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+tj;
atom2 = y*TILE_SIZE+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
......@@ -164,11 +185,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
......
......@@ -6,7 +6,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global float* global_value, __local float* local_value,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
......@@ -21,30 +21,53 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom;
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
......@@ -64,7 +87,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j;
atom2 = y*TILE_SIZE+baseLocalAtom+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
......@@ -90,9 +113,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
}
......@@ -101,7 +124,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
......@@ -110,11 +133,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
......@@ -138,7 +158,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj;
atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
......@@ -167,11 +187,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset1] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
global_value[offset2] += local_value[get_local_id(0)]+local_value[get_local_id(0)+TILE_SIZE];
......
......@@ -6,7 +6,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global float* global_value, __local float* local_value,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
......@@ -23,28 +23,52 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4];
__local int exclusionIndex[2];
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
......@@ -64,7 +88,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+j;
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
......@@ -85,9 +109,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_value[offset] += value;
}
......@@ -95,7 +119,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
......@@ -125,7 +149,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+j;
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
......@@ -154,11 +178,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
......@@ -180,7 +201,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y+tj;
atom2 = y*TILE_SIZE+tj;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
......@@ -204,11 +225,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_value[offset1] += value;
global_value[offset2] += local_value[get_local_id(0)];
......
......@@ -31,13 +31,22 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
......@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
......@@ -66,9 +73,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#endif
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
......@@ -87,9 +94,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
}
......@@ -98,7 +105,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
......@@ -113,9 +120,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f);
......@@ -126,9 +130,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
......@@ -168,11 +172,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
......@@ -206,13 +210,22 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
......@@ -225,10 +238,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -266,9 +277,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
......@@ -278,7 +289,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
......@@ -294,12 +305,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -343,11 +351,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx,
......
......@@ -33,12 +33,21 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
......@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
......@@ -62,9 +69,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
......@@ -86,9 +93,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum;
}
......@@ -96,7 +103,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
......@@ -127,9 +134,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
......@@ -182,9 +189,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
......@@ -195,9 +199,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
......@@ -235,11 +239,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
......@@ -274,12 +278,21 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
......@@ -291,10 +304,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -330,9 +341,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w;
......@@ -341,7 +352,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
......@@ -386,9 +397,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS) {
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
#endif
dEdR = 0.0f;
tempEnergy = 0.0f;
......@@ -424,12 +435,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -471,11 +479,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
......
......@@ -13,7 +13,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
......@@ -27,20 +27,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) {
// This tile is on the diagonal.
......@@ -50,10 +75,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id(0)].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom;
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
......@@ -71,7 +94,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j;
atom2 = y*TILE_SIZE+baseLocalAtom+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
......@@ -96,9 +119,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
}
......@@ -107,7 +130,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
......@@ -122,11 +145,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
......@@ -148,7 +168,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj;
atom2 = y*TILE_SIZE+baseLocalAtom+tj;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
......@@ -182,11 +202,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f);
......
......@@ -13,7 +13,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
......@@ -29,19 +29,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4];
__local int exclusionIndex[2];
while (pos < end) {
// Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TILE_SIZE;
unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
......@@ -50,10 +76,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
......@@ -71,7 +95,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float r = sqrt(r2);
float invR = RECIP(r);
LOAD_ATOM2_PARAMETERS
atom2 = y+j;
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
......@@ -91,9 +115,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz += force.xyz;
}
......@@ -101,7 +125,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
......@@ -136,7 +160,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y+j;
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
......@@ -179,11 +203,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
......@@ -203,7 +224,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y+tj;
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
......@@ -232,11 +253,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment