Commit 839f9a81 authored by Peter Eastman's avatar Peter Eastman
Browse files

Finished changes to reduce memory use for large systems

parent a8fe9cea
...@@ -1691,6 +1691,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1691,6 +1691,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
if (nb.getUseCutoff())
defines["MAX_TILES"] = OpenCLExpressionUtilities::intToString(nb.getInteractingTiles().getSize());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default); string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::gbsaObc_nvidia : OpenCLKernelSources::gbsaObc_default);
cl::Program program = cl.createProgram(file, defines); cl::Program program = cl.createProgram(file, defines);
int index = 0; int index = 0;
...@@ -1702,8 +1704,9 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1702,8 +1704,9 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
if (cl.getSIMDWidth() == 32)
computeBornSumKernel.setArg<cl::Buffer>(index+2, nb.getInteractionFlags().getDeviceBuffer());
} }
else else
computeBornSumKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2); computeBornSumKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
...@@ -1718,8 +1721,9 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1718,8 +1721,9 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL); force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
if (cl.getSIMDWidth() == 32)
force1Kernel.setArg<cl::Buffer>(index+2, nb.getInteractionFlags().getDeviceBuffer());
} }
else else
force1Kernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2); force1Kernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
...@@ -1744,10 +1748,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1744,10 +1748,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
} }
int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2; int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(computeBornSumKernel, numTiles*OpenCLContext::TileSize); cl.executeKernel(computeBornSumKernel, numTiles*OpenCLContext::TileSize);
...@@ -2399,9 +2403,11 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2399,9 +2403,11 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed. index += 2; // Periodic box size arguments are set when the kernel is executed.
pairValueKernel.setArg<cl_uint>(index++, nb.getInteractingTiles().getSize());
if (cl.getSIMDWidth() == 32)
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
} }
else else
pairValueKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2); pairValueKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
...@@ -2445,9 +2451,11 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2445,9 +2451,11 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed. index += 2; // Periodic box size arguments are set when the kernel is executed.
pairEnergyKernel.setArg<cl_uint>(index++, nb.getInteractingTiles().getSize());
if (cl.getSIMDWidth() == 32)
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
} }
else else
pairEnergyKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2); pairEnergyKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
...@@ -2518,10 +2526,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2518,10 +2526,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues); globals->upload(globalParamValues);
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
} }
int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2; int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize); cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize);
......
...@@ -200,8 +200,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -200,8 +200,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Create data structures for the neighbor list. // Create data structures for the neighbor list.
if (useCutoff) { if (useCutoff) {
interactingTiles = new OpenCLArray<mm_ushort2>(context, numTiles, "interactingTiles"); // Select a size for the arrays that hold the neighbor list. This estimate is intentionally very
interactionFlags = new OpenCLArray<cl_uint>(context, numTiles, "interactionFlags"); // high, because if it ever is too small, we have to fall back to the N^2 algorithm.
mm_float4 boxSize = context.getPeriodicBoxSize();
int maxInteractingTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
if (maxInteractingTiles < 1)
maxInteractingTiles = 1;
if (maxInteractingTiles > numTiles)
maxInteractingTiles = numTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
if (context.getSIMDWidth() == 32)
interactionFlags = new OpenCLArray<cl_uint>(context, maxInteractingTiles, "interactionFlags");
interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount"); interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount");
blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter"); blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox"); blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
...@@ -213,6 +223,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -213,6 +223,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if (useCutoff) { if (useCutoff) {
map<string, string> defines; map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["MAX_TILES"] = OpenCLExpressionUtilities::intToString(interactingTiles->getSize());
if (forceBufferPerAtomBlock) if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic) if (usePeriodic)
...@@ -231,15 +242,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -231,15 +242,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, context.getPosq().getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, context.getPosq().getDeviceBuffer());
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); if (context.getSIMDWidth() == 32) {
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff)); findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(5, blockCenter->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(5, blockCenter->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(8, interactionCount->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(8, interactionCount->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL);
}
} }
} }
...@@ -264,6 +277,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -264,6 +277,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms()); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms());
vector<cl_uint> count;
interactionCount->download(count);
if (context.getSIMDWidth() == 32) { if (context.getSIMDWidth() == 32) {
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
...@@ -274,8 +289,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -274,8 +289,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize); context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize);
} }
...@@ -388,6 +403,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -388,6 +403,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
if (useCutoff)
defines["MAX_TILES"] = OpenCLExpressionUtilities::intToString(interactingTiles->getSize());
string file = (context.getSIMDWidth() == 32 ? OpenCLKernelSources::nonbonded_nvidia : OpenCLKernelSources::nonbonded_default); string file = (context.getSIMDWidth() == 32 ? OpenCLKernelSources::nonbonded_nvidia : OpenCLKernelSources::nonbonded_default);
cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines); cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
cl::Kernel kernel(program, "computeNonbonded"); cl::Kernel kernel(program, "computeNonbonded");
...@@ -405,9 +422,10 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -405,9 +422,10 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
if (useCutoff) { if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
index += 2; // The periodic box size arguments are set when the kernel is executed. index += 2; // The periodic box size arguments are set when the kernel is executed.
if (context.getSIMDWidth() == 32)
kernel.setArg<cl::Buffer>(index++, interactionFlags->getDeviceBuffer());
} }
else { else {
kernel.setArg<cl_uint>(index++, context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2); kernel.setArg<cl_uint>(index++, context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2);
......
...@@ -11,16 +11,19 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -11,16 +11,19 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
#endif unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
...@@ -29,18 +32,23 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -29,18 +32,23 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= maxTiles) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
......
...@@ -11,18 +11,21 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -11,18 +11,21 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4]; __local unsigned int exclusionRange[4];
...@@ -30,18 +33,23 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -30,18 +33,23 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= maxTiles) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -128,7 +136,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer ...@@ -128,7 +136,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
local_force[get_local_id(0)] = 0.0f; local_force[get_local_id(0)] = 0.0f;
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) { if (!hasExclusions && flags == 0) {
// No interactions in this tile. // No interactions in this tile.
} }
......
...@@ -9,16 +9,19 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -9,16 +9,19 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
#endif unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
...@@ -26,18 +29,23 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -26,18 +29,23 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= maxTiles) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
......
...@@ -9,18 +9,21 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -9,18 +9,21 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4]; __local unsigned int exclusionRange[4];
...@@ -28,18 +31,23 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -28,18 +31,23 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= maxTiles) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -125,7 +133,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global ...@@ -125,7 +133,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
} }
local_value[get_local_id(0)] = 0.0f; local_value[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) { if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
......
...@@ -147,8 +147,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -147,8 +147,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, numValid); *baseIndex = atom_add(interactionCount, numValid);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE) if (*baseIndex+numValid <= MAX_TILES)
interactingTiles[*baseIndex+i] = temp[i]; for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
...@@ -232,6 +233,8 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB ...@@ -232,6 +233,8 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
unsigned int index = get_local_id(0) & (TILE_SIZE - 1); unsigned int index = get_local_id(0) & (TILE_SIZE - 1);
if (numTiles > MAX_TILES)
return;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float4 apos; float4 apos;
while (pos < end) { while (pos < end) {
......
...@@ -17,32 +17,40 @@ typedef struct { ...@@ -17,32 +17,40 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
#endif unsigned int pos = get_group_id(0)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= MAX_TILES) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
...@@ -196,32 +204,40 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -196,32 +204,40 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
__global float4* posq, __global float* global_bornRadii, __global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
#endif unsigned int pos = get_group_id(0)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= MAX_TILES) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
......
...@@ -17,34 +17,42 @@ typedef struct { ...@@ -17,34 +17,42 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global unsigned int* interactionFlags) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif #endif
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= MAX_TILES) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -115,7 +123,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -115,7 +123,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
} }
localData[get_local_id(0)].bornSum = 0.0f; localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = (numTiles <= MAX_TILES ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
...@@ -262,34 +270,42 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -262,34 +270,42 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
__global float4* posq, __global float* global_bornRadii, __global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global unsigned int* interactionFlags) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif #endif
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= MAX_TILES) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -365,7 +381,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -365,7 +381,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].fz = 0.0f; localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = (numTiles <= MAX_TILES ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
......
...@@ -15,16 +15,19 @@ __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) ...@@ -15,16 +15,19 @@ __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
#endif unsigned int pos = get_group_id(0)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
...@@ -32,18 +35,23 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -32,18 +35,23 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= MAX_TILES) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2); unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE); unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
......
...@@ -15,18 +15,21 @@ __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) ...@@ -15,18 +15,21 @@ __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global unsigned int* interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > MAX_TILES ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps; unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[4]; __local unsigned int exclusionRange[4];
...@@ -34,18 +37,23 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -34,18 +37,23 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
ushort2 tileIndices = tiles[pos]; if (numTiles <= MAX_TILES) {
unsigned int x = tileIndices.x; ushort2 tileIndices = tiles[pos];
unsigned int y = tileIndices.y; x = tileIndices.x;
#else y = tileIndices.y;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
unsigned int x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
else
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -137,7 +145,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -137,7 +145,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id(0)].fy = 0.0f; localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[get_local_id(0)].fz = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = (numTiles <= MAX_TILES ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) { if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
......
...@@ -523,7 +523,8 @@ void testBlockInteractions(bool periodic) { ...@@ -523,7 +523,8 @@ void testBlockInteractions(bool periodic) {
int numWithInteractions = interactionCount[0]; int numWithInteractions = interactionCount[0];
vector<bool> hasInteractions(numBlocks*(numBlocks+1)/2, false); vector<bool> hasInteractions(numBlocks*(numBlocks+1)/2, false);
nb.getInteractingTiles().download(interactingTiles); nb.getInteractingTiles().download(interactingTiles);
nb.getInteractionFlags().download(interactionFlags); if (clcontext.getSIMDWidth() == 32)
nb.getInteractionFlags().download(interactionFlags);
const unsigned int atoms = clcontext.getPaddedNumAtoms(); const unsigned int atoms = clcontext.getPaddedNumAtoms();
const unsigned int grid = OpenCLContext::TileSize; const unsigned int grid = OpenCLContext::TileSize;
const unsigned int dim = clcontext.getNumAtomBlocks(); const unsigned int dim = clcontext.getNumAtomBlocks();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment