Commit 22078b69 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing changes to reduce memory use for large systems

parent 0f8da117
......@@ -1705,10 +1705,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
}
else {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
else
computeBornSumKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
force1Kernel = cl::Kernel(program, "computeGBSAForce1");
index = 0;
force1Kernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer());
......@@ -1723,10 +1721,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
}
else {
force1Kernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
else
force1Kernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines);
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
......@@ -1753,9 +1749,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
}
cl.executeKernel(computeBornSumKernel, nb.getTiles().getSize()*OpenCLContext::TileSize);
int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(computeBornSumKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(reduceBornSumKernel, cl.getPaddedNumAtoms());
cl.executeKernel(force1Kernel, nb.getTiles().getSize()*OpenCLContext::TileSize);
cl.executeKernel(force1Kernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(reduceBornForceKernel, cl.getPaddedNumAtoms());
return 0.0;
}
......@@ -2406,10 +2403,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed.
}
else {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
pairValueKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
else
pairValueKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
if (globals != NULL)
pairValueKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
......@@ -2454,10 +2449,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed.
}
else {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
else
pairEnergyKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
if (globals != NULL)
pairEnergyKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
......@@ -2530,9 +2523,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
}
cl.executeKernel(pairValueKernel, nb.getTiles().getSize()*OpenCLContext::TileSize);
int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms());
cl.executeKernel(pairEnergyKernel, nb.getTiles().getSize()*OpenCLContext::TileSize);
cl.executeKernel(pairEnergyKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(perParticleEnergyKernel, cl.getPaddedNumAtoms());
if (needParameterGradient)
cl.executeKernel(gradientChainRuleKernel, cl.getPaddedNumAtoms());
......
......@@ -26,7 +26,6 @@
#include "OpenCLNonbondedUtilities.h"
#include "OpenCLArray.h"
#include "OpenCLCompact.h"
#include "OpenCLKernelSources.h"
#include "OpenCLExpressionUtilities.h"
#include <map>
......@@ -37,8 +36,8 @@ using namespace OpenMM;
using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), tiles(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) {
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
// Decide how many force buffers to use.
forceBufferPerAtomBlock = false;
......@@ -52,8 +51,6 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
}
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
if (tiles != NULL)
delete tiles;
if (exclusionIndices != NULL)
delete exclusionIndices;
if (exclusionRowIndices != NULL)
......@@ -70,8 +67,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (compact != NULL)
delete compact;
}
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
......@@ -127,33 +122,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int numAtomBlocks = context.getNumAtomBlocks();
int numTiles = numAtomBlocks*(numAtomBlocks+1)/2;
tiles = new OpenCLArray<cl_uint>(context, numTiles, "tiles");
vector<cl_uint> tileVec(tiles->getSize());
unsigned int count = 0;
for (unsigned int y = 0; y < (unsigned int) numAtomBlocks; y++)
for (unsigned int x = y; x < (unsigned int) numAtomBlocks; x++)
tileVec[count++] = (x << 17) | (y << 2);
// Mark which tiles have exclusions.
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/OpenCLContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/OpenCLContext::TileSize;
int index = (x > y ? x+y*numAtomBlocks-y*(y+1)/2 : y+x*numAtomBlocks-x*(x+1)/2);
tileVec[index] |= 1;
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
int lastTile = context.getNumAtoms()/OpenCLContext::TileSize;
for (int i = 0; i < numTiles; ++i) {
int x = tileVec[i]>>17;
int y = (tileVec[i]>>2)&0x7FFF;
if (x == lastTile || y == lastTile)
tileVec[i] |= 1;
}
}
// Build a list of indices for the tiles with exclusions.
......@@ -227,7 +195,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
}
}
atomExclusions.clear(); // We won't use this again, so free the memory it used
tiles->upload(tileVec);
exclusions->upload(exclusionVec);
// Create data structures for the neighbor list.
......@@ -238,7 +205,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount");
blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
compact = new OpenCLCompact(context);
}
// Create kernels.
......@@ -246,6 +212,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) {
map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic)
......@@ -256,13 +223,13 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksKernel.setArg<cl_int>(0, tiles->getSize());
findInteractingBlocksKernel.setArg<cl_float>(1, (cl_float) (cutoff*cutoff));
findInteractingBlocksKernel.setArg<cl::Buffer>(4, tiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
......@@ -293,10 +260,9 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
findBlockBoundsKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findBlockBoundsKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
findInteractingBlocksKernel.setArg<mm_float4>(2, context.getPeriodicBoxSize());
findInteractingBlocksKernel.setArg<mm_float4>(3, context.getInvPeriodicBoxSize());
findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms());
compact->compactStream(*interactingTiles, *tiles, *interactionFlags, *interactionCount);
if (context.getSIMDWidth() == 32) {
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
......@@ -305,12 +271,12 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
}
void OpenCLNonbondedUtilities::computeInteractions() {
if (tiles != NULL) {
if (cutoff != -1.0) {
if (useCutoff) {
forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
}
context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize);
context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize);
}
}
......@@ -443,8 +409,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
index += 2; // The periodic box size arguments are set when the kernel is executed.
}
else {
kernel.setArg<cl::Buffer>(index++, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(index++, tiles->getSize());
kernel.setArg<cl_uint>(index++, context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2);
}
for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(index++, params[i].getMemory());
......
......@@ -35,8 +35,6 @@
namespace OpenMM {
class OpenCLCompact;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create Kernels that evaluate nonbonded interactions. Clients
......@@ -141,12 +139,6 @@ public:
OpenCLArray<mm_float4>& getBlockBoundingBoxes() {
return *blockBoundingBox;
}
/**
* Get the array containing the full set of tiles.
*/
OpenCLArray<cl_uint>& getTiles() {
return *tiles;
}
/**
* Get the array whose first element contains the number of tiles with interactions.
*/
......@@ -202,7 +194,6 @@ private:
cl::Kernel findBlockBoundsKernel;
cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel;
OpenCLArray<cl_uint>* tiles;
OpenCLArray<cl_uint>* exclusions;
OpenCLArray<cl_uint>* exclusionIndices;
OpenCLArray<cl_uint>* exclusionRowIndices;
......@@ -214,7 +205,6 @@ private:
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
OpenCLCompact* compact;
std::string kernelSource;
std::map<std::string, std::string> kernelDefines;
double cutoff;
......
......@@ -9,9 +9,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
unsigned int numTiles
#endif
......
......@@ -9,9 +9,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
unsigned int numTiles
#endif
......
......@@ -7,9 +7,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles,
__local float* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
unsigned int numTiles
#endif
......
......@@ -7,9 +7,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles,
__local float* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
unsigned int numTiles
#endif
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32
#define GROUP_SIZE 64
#define BUFFER_GROUPS 4
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox) {
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, __global unsigned int* interactionCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
......@@ -32,21 +36,89 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
index += get_global_size(0);
base = index*TILE_SIZE;
}
if (get_global_id(0) == 0)
interactionCount[0] = 0;
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory.
*/
void storeInteractionData(__local short2* buffer, __local bool* valid, __local int* sum, __local int* sum2, __local short2* temp, __local int* baseIndex,
__global unsigned int* interactionCount, __global unsigned int* interactingTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = (valid[i] ? 1 : 0);
barrier(CLK_LOCAL_MEM_FENCE);
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum2[i] = (i < offset ? sum[i] : sum[i]+sum[i-offset]);
else
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = (i < offset ? sum2[i] : sum2[i]+sum2[i-offset]);
whichBuffer = 1-whichBuffer;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (whichBuffer == 1) {
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = sum2[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
// Compact the buffer and store it to global memory.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
if (valid[i]) {
temp[sum[i]-1] = buffer[i];
valid[i] = false;
}
barrier(CLK_LOCAL_MEM_FENCE);
int numValid = sum[BUFFER_SIZE-1];
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, numValid);
barrier(CLK_LOCAL_MEM_FENCE);
// Store it to global memory.
for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = (temp[i].x<<17)+(temp[i].y<<2);
barrier(CLK_LOCAL_MEM_FENCE);
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global unsigned int* tiles, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionFlag) {
int index = get_global_id(0);
while (index < numTiles) {
// Extract cell coordinates from appropriate work unit
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global unsigned int* interactingTiles) {
__local short2 buffer[BUFFER_SIZE];
__local bool valid[BUFFER_SIZE];
__local int sum[BUFFER_SIZE];
__local int sum2[BUFFER_SIZE];
__local short2 temp[BUFFER_SIZE];
__local int bufferFull;
__local int globalIndex;
int valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+get_local_id(0)] = false;
barrier(CLK_LOCAL_MEM_FENCE);
const int numTiles = (NUM_BLOCKS*(NUM_BLOCKS+1))/2;
for (int baseIndex = get_group_id(0)*get_local_size(0); baseIndex < numTiles; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
unsigned int x = tiles[index];
unsigned int y = ((x >> 2) & 0x7fff);
x = (x >> 17);
int index = baseIndex+get_local_id(0);
if (index < numTiles) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (index-y*NUM_BLOCKS+y*(y+1)/2);
}
// Find the distance between the bounding boxes of the two cells.
......@@ -61,9 +133,27 @@ __kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, floa
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
interactionFlag[index] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1);
index += get_global_size(0);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
// Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
valid[bufferIndex] = true;
buffer[bufferIndex] = (short2) (x, y);
valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (bufferFull) {
storeInteractionData(buffer, valid, sum, sum2, temp, &globalIndex, interactionCount, interactingTiles);
valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
storeInteractionData(buffer, valid, sum, sum2, temp, &globalIndex, interactionCount, interactingTiles);
}
/**
......
......@@ -15,9 +15,9 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles,
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else
unsigned int numTiles) {
#endif
......@@ -194,9 +194,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else
unsigned int numTiles) {
#endif
......
......@@ -15,9 +15,9 @@ typedef struct {
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles,
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else
unsigned int numTiles) {
#endif
......@@ -116,7 +116,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) {
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
......@@ -260,9 +260,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else
unsigned int numTiles) {
#endif
......@@ -366,7 +366,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) {
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
......
......@@ -13,9 +13,9 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
unsigned int numTiles
#endif
......
......@@ -13,9 +13,9 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
__global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else
unsigned int numTiles
#endif
......
......@@ -521,7 +521,7 @@ void testBlockInteractions(bool periodic) {
vector<cl_uint> interactionFlags;
nb.getInteractionCount().download(interactionCount);
int numWithInteractions = interactionCount[0];
vector<bool> hasInteractions(nb.getTiles().getSize(), false);
vector<bool> hasInteractions(numBlocks*(numBlocks+1)/2, false);
nb.getInteractingTiles().download(interactingTiles);
nb.getInteractionFlags().download(interactionFlags);
const unsigned int atoms = clcontext.getPaddedNumAtoms();
......@@ -580,13 +580,10 @@ void testBlockInteractions(bool periodic) {
// Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
vector<cl_uint> tiles;
nb.getTiles().download(tiles);
for (int i = 0; i < (int) hasInteractions.size(); i++)
if (!hasInteractions[i]) {
unsigned int tile = tiles[i];
unsigned int x = (tile >> 17);
unsigned int y = ((tile >> 2) & 0x7fff);
unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
unsigned int x = (i-y*numBlocks+y*(y+1)/2);
for (int atom1 = 0; atom1 < blockSize; ++atom1) {
mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment