Commit 9a4e2b02 authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginning of changes to reduce memory use for large systems

parent 2bdb84bd
...@@ -1690,6 +1690,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1690,6 +1690,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["PREFACTOR"] = doubleToString(prefactor); defines["PREFACTOR"] = doubleToString(prefactor);
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default); string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default);
cl::Program program = cl.createProgram(file, defines); cl::Program program = cl.createProgram(file, defines);
int index = 0; int index = 0;
...@@ -1985,6 +1986,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -1985,6 +1986,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines["CUTOFF_SQUARED"] = doubleToString(force.getCutoffDistance()*force.getCutoffDistance()); defines["CUTOFF_SQUARED"] = doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBValueN2_nvidia : OpenCLKernelSources::customGBValueN2_default); string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBValueN2_nvidia : OpenCLKernelSources::customGBValueN2_default);
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines); cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairValueKernel = cl::Kernel(program, "computeN2Value"); pairValueKernel = cl::Kernel(program, "computeN2Value");
...@@ -2131,6 +2133,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2131,6 +2133,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines["CUTOFF_SQUARED"] = doubleToString(force.getCutoffDistance()*force.getCutoffDistance()); defines["CUTOFF_SQUARED"] = doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBEnergyN2_nvidia : OpenCLKernelSources::customGBEnergyN2_default); string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBEnergyN2_nvidia : OpenCLKernelSources::customGBEnergyN2_default);
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines); cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairEnergyKernel = cl::Kernel(program, "computeN2Energy"); pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
...@@ -2393,6 +2396,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2393,6 +2396,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
...@@ -2442,6 +2446,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2442,6 +2446,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
...@@ -2520,10 +2525,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2520,10 +2525,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues); globals->upload(globalParamValues);
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
} }
cl.executeKernel(pairValueKernel, nb.getTiles().getSize()*OpenCLContext::TileSize); cl.executeKernel(pairValueKernel, nb.getTiles().getSize()*OpenCLContext::TileSize);
cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms()); cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms());
......
...@@ -30,12 +30,14 @@ ...@@ -30,12 +30,14 @@
#include "OpenCLKernelSources.h" #include "OpenCLKernelSources.h"
#include "OpenCLExpressionUtilities.h" #include "OpenCLExpressionUtilities.h"
#include <map> #include <map>
#include <set>
#include <utility>
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), numForceBuffers(0), tiles(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) { interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) {
// Decide how many force buffers to use. // Decide how many force buffers to use.
...@@ -52,8 +54,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -52,8 +54,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
if (tiles != NULL) if (tiles != NULL)
delete tiles; delete tiles;
if (exclusionIndex != NULL) if (exclusionIndices != NULL)
delete exclusionIndex; delete exclusionIndices;
if (exclusionRowIndices != NULL)
delete exclusionRowIndices;
if (exclusions != NULL) if (exclusions != NULL)
delete exclusions; delete exclusions;
if (interactingTiles != NULL) if (interactingTiles != NULL)
...@@ -153,16 +157,36 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -153,16 +157,36 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Build a list of indices for the tiles with exclusions. // Build a list of indices for the tiles with exclusions.
exclusionIndex = new OpenCLArray<cl_uint>(context, numTiles, "exclusionIndex"); set<pair<int, int> > tilesWithExclusions;
vector<cl_uint> exclusionIndexVec(exclusionIndex->getSize()); for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int numWithExclusions = 0; int x = atom1/OpenCLContext::TileSize;
for (int i = 0; i < numTiles; ++i) for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
if ((tileVec[i]&1) == 1) int atom2 = atomExclusions[atom1][j];
exclusionIndexVec[i] = (numWithExclusions++)*OpenCLContext::TileSize; int y = atom2/OpenCLContext::TileSize;
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
for (int i = 0; i < numAtomBlocks; ++i)
tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
}
vector<cl_uint> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<cl_uint> exclusionIndicesVec;
int currentRow = 0;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
while (iter->first != currentRow)
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndicesVec.push_back(iter->second);
}
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndices = new OpenCLArray<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = new OpenCLArray<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec);
exclusionRowIndices->upload(exclusionRowIndicesVec);
// Record the exclusion data. // Record the exclusion data.
exclusions = new OpenCLArray<cl_uint>(context, numWithExclusions*OpenCLContext::TileSize, "exclusions"); exclusions = new OpenCLArray<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
vector<cl_uint> exclusionVec(exclusions->getSize()); vector<cl_uint> exclusionVec(exclusions->getSize());
for (int i = 0; i < exclusions->getSize(); ++i) for (int i = 0; i < exclusions->getSize(); ++i)
exclusionVec[i] = 0xFFFFFFFF; exclusionVec[i] = 0xFFFFFFFF;
...@@ -174,12 +198,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -174,12 +198,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int y = atom2/OpenCLContext::TileSize; int y = atom2/OpenCLContext::TileSize;
int offset2 = atom2-y*OpenCLContext::TileSize; int offset2 = atom2-y*OpenCLContext::TileSize;
if (x > y) { if (x > y) {
int tile = x+y*numAtomBlocks-y*(y+1)/2; int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<<offset2); exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
} }
else { else {
int tile = y+x*numAtomBlocks-x*(x+1)/2; int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<<offset1); exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
} }
} }
} }
...@@ -193,19 +217,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -193,19 +217,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int y = atom2/OpenCLContext::TileSize; int y = atom2/OpenCLContext::TileSize;
int offset2 = atom2-y*OpenCLContext::TileSize; int offset2 = atom2-y*OpenCLContext::TileSize;
if (x >= y) { if (x >= y) {
int tile = x+y*numAtomBlocks-y*(y+1)/2; int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<<offset2); exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
} }
if (y >= x) { if (y >= x) {
int tile = y+x*numAtomBlocks-x*(x+1)/2; int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<<offset1); exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
} }
} }
} }
atomExclusions.clear(); // We won't use this again, so free the memory it used atomExclusions.clear(); // We won't use this again, so free the memory it used
tiles->upload(tileVec); tiles->upload(tileVec);
exclusions->upload(exclusionVec); exclusions->upload(exclusionVec);
exclusionIndex->upload(exclusionIndexVec);
// Create data structures for the neighbor list. // Create data structures for the neighbor list.
...@@ -252,6 +275,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -252,6 +275,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
} }
} }
int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_uint>& exclusionIndices, const vector<cl_uint>& exclusionRowIndices) {
int start = exclusionRowIndices[x];
int end = exclusionRowIndices[x+1];
for (int i = start; i < end; i++)
if (exclusionIndices[i] == y)
return i*OpenCLContext::TileSize;
throw OpenMMException("Internal error: exclusion in unexpected tile");
}
void OpenCLNonbondedUtilities::prepareInteractions() { void OpenCLNonbondedUtilities::prepareInteractions() {
if (!useCutoff) if (!useCutoff)
return; return;
...@@ -275,8 +307,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -275,8 +307,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (tiles != NULL) { if (tiles != NULL) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize); context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize);
} }
...@@ -388,6 +420,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -388,6 +420,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff); defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
string file = (context.getSIMDWidth() == 32 ? OpenCLKernelSources::nonbonded_nvidia : OpenCLKernelSources::nonbonded_default); string file = (context.getSIMDWidth() == 32 ? OpenCLKernelSources::nonbonded_nvidia : OpenCLKernelSources::nonbonded_default);
cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines); cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
cl::Kernel kernel(program, "computeNonbonded"); cl::Kernel kernel(program, "computeNonbonded");
...@@ -399,7 +432,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -399,7 +432,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndex->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*localDataSize, NULL); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*localDataSize, NULL);
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (useCutoff) { if (useCutoff) {
......
...@@ -175,7 +175,13 @@ public: ...@@ -175,7 +175,13 @@ public:
* Get the array containing the index into the exclusion array for each tile. * Get the array containing the index into the exclusion array for each tile.
*/ */
OpenCLArray<cl_uint>& getExclusionIndices() { OpenCLArray<cl_uint>& getExclusionIndices() {
return *exclusionIndex; return *exclusionIndices;
}
/**
* Get the array listing where the exclusion data starts for each row.
*/
OpenCLArray<cl_uint>& getExclusionRowIndices() {
return *exclusionRowIndices;
} }
/** /**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
...@@ -190,14 +196,16 @@ public: ...@@ -190,14 +196,16 @@ public:
*/ */
cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const; cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
private: private:
static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices);
OpenCLContext& context; OpenCLContext& context;
cl::Kernel forceKernel; cl::Kernel forceKernel;
cl::Kernel findBlockBoundsKernel; cl::Kernel findBlockBoundsKernel;
cl::Kernel findInteractingBlocksKernel; cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel; cl::Kernel findInteractionsWithinBlocksKernel;
OpenCLArray<cl_uint>* tiles; OpenCLArray<cl_uint>* tiles;
OpenCLArray<cl_uint>* exclusionIndex;
OpenCLArray<cl_uint>* exclusions; OpenCLArray<cl_uint>* exclusions;
OpenCLArray<cl_uint>* exclusionIndices;
OpenCLArray<cl_uint>* exclusionRowIndices;
OpenCLArray<cl_uint>* interactingTiles; OpenCLArray<cl_uint>* interactingTiles;
OpenCLArray<cl_uint>* interactionFlags; OpenCLArray<cl_uint>* interactionFlags;
OpenCLArray<cl_uint>* interactionCount; OpenCLArray<cl_uint>* interactionCount;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__local float4* tempForceBuffer, __global unsigned int* tiles, __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
...@@ -23,31 +23,54 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -23,31 +23,54 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
DECLARE_TEMP_BUFFERS DECLARE_TEMP_BUFFERS
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
bool hasExclusions = (x & 0x1); x = (x>>17);
x = (x>>17)*TILE_SIZE; #else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom; unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -67,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -67,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j; atom2 = y*TILE_SIZE+baseLocalAtom+j;
float dEdR = 0.0f; float dEdR = 0.0f;
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
...@@ -94,9 +117,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -94,9 +117,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
...@@ -106,7 +129,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -106,7 +129,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j]; local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
...@@ -116,11 +139,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -116,11 +139,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF; excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16; excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
...@@ -144,7 +164,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -144,7 +164,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj; atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float dEdR = 0.0f; float dEdR = 0.0f;
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
...@@ -176,11 +196,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -176,11 +196,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
...@@ -25,28 +25,52 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -25,28 +25,52 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4];
__local int exclusionIndex[2];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
bool hasExclusions = (x & 0x1); x = (x>>17);
x = (x>>17)*TILE_SIZE; #else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx]; unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -66,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -66,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; float dEdR = 0.0f;
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
...@@ -86,9 +110,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -86,9 +110,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
...@@ -97,7 +121,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -97,7 +121,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j]; local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
...@@ -113,11 +137,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -113,11 +137,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
...@@ -139,7 +160,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -139,7 +160,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+tj; atom2 = y*TILE_SIZE+tj;
float dEdR = 0.0f; float dEdR = 0.0f;
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
...@@ -164,11 +185,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -164,11 +185,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
...@@ -21,30 +21,53 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -21,30 +21,53 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
bool hasExclusions = (x & 0x1); x = (x>>17);
x = (x>>17)*TILE_SIZE; #else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f; float value = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom; unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -64,7 +87,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -64,7 +87,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j; atom2 = y*TILE_SIZE+baseLocalAtom+j;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -90,9 +113,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -90,9 +113,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_value[offset] += value+tempBuffer[get_local_id(0)+TILE_SIZE]; global_value[offset] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
} }
...@@ -101,7 +124,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -101,7 +124,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j]; local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
...@@ -110,11 +133,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -110,11 +133,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF; excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16; excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
...@@ -138,7 +158,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -138,7 +158,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj; atom2 = y*TILE_SIZE+baseLocalAtom+tj;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -167,11 +187,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -167,11 +187,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_value[offset1] += value+tempBuffer[get_local_id(0)+TILE_SIZE]; global_value[offset1] += value+tempBuffer[get_local_id(0)+TILE_SIZE];
global_value[offset2] += local_value[get_local_id(0)]+local_value[get_local_id(0)+TILE_SIZE]; global_value[offset2] += local_value[get_local_id(0)]+local_value[get_local_id(0)+TILE_SIZE];
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles, __local float* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
...@@ -23,28 +23,52 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -23,28 +23,52 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4];
__local int exclusionIndex[2];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
bool hasExclusions = (x & 0x1); x = (x>>17);
x = (x>>17)*TILE_SIZE; #else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f; float value = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1; local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx]; unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -64,7 +88,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -64,7 +88,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -85,9 +109,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -85,9 +109,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_value[offset] += value; global_value[offset] += value;
} }
...@@ -95,7 +119,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -95,7 +119,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j]; local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
...@@ -125,7 +149,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -125,7 +149,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE COMPUTE_VALUE
} }
...@@ -154,11 +178,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -154,11 +178,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
...@@ -180,7 +201,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -180,7 +201,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+tj; atom2 = y*TILE_SIZE+tj;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -204,11 +225,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -204,11 +225,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_value[offset1] += value; global_value[offset1] += value;
global_value[offset2] += local_value[get_local_id(0)]; global_value[offset2] += local_value[get_local_id(0)];
......
...@@ -31,13 +31,22 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -31,13 +31,22 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17)*TILE_SIZE; x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f; float bornSum = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
...@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id(0)].radius = params1.x; localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y; localData[get_local_id(0)].scaledRadius = params1.y;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f); float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -66,9 +73,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -66,9 +73,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius); float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ)); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ)); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#endif #endif
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
...@@ -87,9 +94,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -87,9 +94,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE]; global_bornSum[offset] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
} }
...@@ -98,7 +105,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -98,7 +105,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
...@@ -113,9 +120,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -113,9 +120,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2); unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f); float4 delta = (float4) (localData[baseLocalAtom+tj].x-posq1.x, localData[baseLocalAtom+tj].y-posq1.y, localData[baseLocalAtom+tj].z-posq1.z, 0.0f);
...@@ -126,9 +130,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -126,9 +130,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
...@@ -168,11 +172,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -168,11 +172,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE]; global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum; global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
...@@ -206,13 +210,22 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -206,13 +210,22 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17)*TILE_SIZE; x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1]; float bornRadius1 = global_bornRadii[atom1];
...@@ -225,10 +238,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -225,10 +238,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1; localData[get_local_id(0)].bornRadius = bornRadius1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+j < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q); float4 posq2 = (float4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -266,9 +277,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -266,9 +277,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w; global_bornForce[offset] += force.w+tempBuffer[get_local_id(0)+TILE_SIZE].w;
...@@ -278,7 +289,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -278,7 +289,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
...@@ -294,12 +305,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -294,12 +305,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx%(TILE_SIZE/2); unsigned int tj = tgx%(TILE_SIZE/2);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y+baseLocalAtom+tj < NUM_ATOMS); unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+tj < NUM_ATOMS);
float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q); float4 posq2 = (float4) (localData[baseLocalAtom+tj].x, localData[baseLocalAtom+tj].y, localData[baseLocalAtom+tj].z, localData[baseLocalAtom+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -343,11 +351,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -343,11 +351,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx,
......
...@@ -33,12 +33,21 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -33,12 +33,21 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17)*TILE_SIZE; x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f; float bornSum = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
...@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x; localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y; localData[get_local_id(0)].scaledRadius = params1.y;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f); float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -62,9 +69,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -62,9 +69,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
...@@ -86,9 +93,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -86,9 +93,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
} }
...@@ -96,7 +103,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -96,7 +103,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
...@@ -127,9 +134,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -127,9 +134,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f; tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
...@@ -182,9 +189,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -182,9 +189,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f); float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
...@@ -195,9 +199,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -195,9 +199,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
...@@ -235,11 +239,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -235,11 +239,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Write results // Write results
float4 of; float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum; global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
...@@ -274,12 +278,21 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -274,12 +278,21 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
x = (x>>17)*TILE_SIZE; x = (x>>17);
#else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1]; float bornRadius1 = global_bornRadii[atom1];
...@@ -291,10 +304,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -291,10 +304,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].z = posq1.z; localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1; localData[get_local_id(0)].bornRadius = bornRadius1;
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q); float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -330,9 +341,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -330,9 +341,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
...@@ -341,7 +352,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -341,7 +352,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
...@@ -386,9 +397,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -386,9 +397,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
float dEdR = Gpol*(1.0f - 0.25f*expTerm); float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) { if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else #else
if (atom1 >= NUM_ATOMS || y+j >= NUM_ATOMS) { if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
#endif #endif
dEdR = 0.0f; dEdR = 0.0f;
tempEnergy = 0.0f; tempEnergy = 0.0f;
...@@ -424,12 +435,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -424,12 +435,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q); float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -471,11 +479,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -471,11 +479,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0); forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
......
...@@ -13,7 +13,7 @@ typedef struct { ...@@ -13,7 +13,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
...@@ -27,20 +27,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -27,20 +27,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
bool hasExclusions = (x & 0x1); x = (x>>17);
x = (x>>17)*TILE_SIZE; #else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
...@@ -50,10 +75,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -50,10 +75,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom; unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -71,7 +94,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -71,7 +94,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j; atom2 = y*TILE_SIZE+baseLocalAtom+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
...@@ -96,9 +119,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -96,9 +119,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
} }
...@@ -107,7 +130,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -107,7 +130,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
...@@ -122,11 +145,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -122,11 +145,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF; excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16; excl += excl << 16;
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
...@@ -148,7 +168,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -148,7 +168,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj; atom2 = y*TILE_SIZE+baseLocalAtom+tj;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
...@@ -182,11 +202,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -182,11 +202,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz; forceBuffers[offset1].xyz = forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f); float4 sum = (float4) (localData[get_local_id(0)].fx+localData[get_local_id(0)+TILE_SIZE].fx, localData[get_local_id(0)].fy+localData[get_local_id(0)+TILE_SIZE].fy, localData[get_local_id(0)].fz+localData[get_local_id(0)+TILE_SIZE].fz, 0.0f);
......
...@@ -13,7 +13,7 @@ typedef struct { ...@@ -13,7 +13,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
...@@ -29,19 +29,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -29,19 +29,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4];
__local int exclusionIndex[2];
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
#ifdef USE_CUTOFF
unsigned int x = tiles[pos]; unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TILE_SIZE; unsigned int y = ((x >> 2) & 0x7fff);
bool hasExclusions = (x & 0x1); x = (x>>17);
x = (x>>17)*TILE_SIZE; #else
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#endif
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
...@@ -50,10 +76,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -50,10 +76,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id(0)].z = posq1.z; localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
unsigned int xi = x/TILE_SIZE;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx]; unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
...@@ -71,7 +95,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -71,7 +95,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float r = sqrt(r2); float r = sqrt(r2);
float invR = RECIP(r); float invR = RECIP(r);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
...@@ -91,9 +115,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -91,9 +115,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
} }
...@@ -101,7 +125,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -101,7 +125,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) { if (lasty != y) {
unsigned int j = y + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
...@@ -136,7 +160,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -136,7 +160,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
...@@ -179,11 +203,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -179,11 +203,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int xi = x/TILE_SIZE;
unsigned int yi = y/TILE_SIZE;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
...@@ -203,7 +224,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -203,7 +224,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+tj; atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
...@@ -232,11 +253,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -232,11 +253,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
// Write results // Write results
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TILE_SIZE)*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else #else
unsigned int offset1 = x + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
#endif #endif
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1].xyz += force.xyz;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment