Commit 22078b69 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing changes to reduce memory use for large systems

parent 0f8da117
...@@ -1705,10 +1705,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1705,10 +1705,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
} }
else { else
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
computeBornSumKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
force1Kernel = cl::Kernel(program, "computeGBSAForce1"); force1Kernel = cl::Kernel(program, "computeGBSAForce1");
index = 0; index = 0;
force1Kernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer());
...@@ -1723,10 +1721,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1723,10 +1721,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
} }
else { else
force1Kernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer()); force1Kernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
force1Kernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines); program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines);
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum"); reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms()); reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
...@@ -1753,9 +1749,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1753,9 +1749,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
} }
cl.executeKernel(computeBornSumKernel, nb.getTiles().getSize()*OpenCLContext::TileSize); int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(computeBornSumKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(reduceBornSumKernel, cl.getPaddedNumAtoms()); cl.executeKernel(reduceBornSumKernel, cl.getPaddedNumAtoms());
cl.executeKernel(force1Kernel, nb.getTiles().getSize()*OpenCLContext::TileSize); cl.executeKernel(force1Kernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(reduceBornForceKernel, cl.getPaddedNumAtoms()); cl.executeKernel(reduceBornForceKernel, cl.getPaddedNumAtoms());
return 0.0; return 0.0;
} }
...@@ -2406,10 +2403,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2406,10 +2403,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed. index += 2; // Periodic box size arguments are set when the kernel is executed.
} }
else { else
pairValueKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer()); pairValueKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
pairValueKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
if (globals != NULL) if (globals != NULL)
pairValueKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
...@@ -2454,10 +2449,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2454,10 +2449,8 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed. index += 2; // Periodic box size arguments are set when the kernel is executed.
} }
else { else
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
pairEnergyKernel.setArg<cl_uint>(index++, nb.getTiles().getSize());
}
if (globals != NULL) if (globals != NULL)
pairEnergyKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
...@@ -2530,9 +2523,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2530,9 +2523,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
} }
cl.executeKernel(pairValueKernel, nb.getTiles().getSize()*OpenCLContext::TileSize); int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms()); cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms());
cl.executeKernel(pairEnergyKernel, nb.getTiles().getSize()*OpenCLContext::TileSize); cl.executeKernel(pairEnergyKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(perParticleEnergyKernel, cl.getPaddedNumAtoms()); cl.executeKernel(perParticleEnergyKernel, cl.getPaddedNumAtoms());
if (needParameterGradient) if (needParameterGradient)
cl.executeKernel(gradientChainRuleKernel, cl.getPaddedNumAtoms()); cl.executeKernel(gradientChainRuleKernel, cl.getPaddedNumAtoms());
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#include "OpenCLNonbondedUtilities.h" #include "OpenCLNonbondedUtilities.h"
#include "OpenCLArray.h" #include "OpenCLArray.h"
#include "OpenCLCompact.h"
#include "OpenCLKernelSources.h" #include "OpenCLKernelSources.h"
#include "OpenCLExpressionUtilities.h" #include "OpenCLExpressionUtilities.h"
#include <map> #include <map>
...@@ -37,8 +36,8 @@ using namespace OpenMM; ...@@ -37,8 +36,8 @@ using namespace OpenMM;
using namespace std; using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), tiles(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) { interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
// Decide how many force buffers to use. // Decide how many force buffers to use.
forceBufferPerAtomBlock = false; forceBufferPerAtomBlock = false;
...@@ -52,8 +51,6 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -52,8 +51,6 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
} }
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
if (tiles != NULL)
delete tiles;
if (exclusionIndices != NULL) if (exclusionIndices != NULL)
delete exclusionIndices; delete exclusionIndices;
if (exclusionRowIndices != NULL) if (exclusionRowIndices != NULL)
...@@ -70,8 +67,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { ...@@ -70,8 +67,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete blockCenter; delete blockCenter;
if (blockBoundingBox != NULL) if (blockBoundingBox != NULL)
delete blockBoundingBox; delete blockBoundingBox;
if (compact != NULL)
delete compact;
} }
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) { void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
...@@ -127,33 +122,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -127,33 +122,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int numAtomBlocks = context.getNumAtomBlocks(); int numAtomBlocks = context.getNumAtomBlocks();
int numTiles = numAtomBlocks*(numAtomBlocks+1)/2; int numTiles = numAtomBlocks*(numAtomBlocks+1)/2;
tiles = new OpenCLArray<cl_uint>(context, numTiles, "tiles");
vector<cl_uint> tileVec(tiles->getSize());
unsigned int count = 0;
for (unsigned int y = 0; y < (unsigned int) numAtomBlocks; y++)
for (unsigned int x = y; x < (unsigned int) numAtomBlocks; x++)
tileVec[count++] = (x << 17) | (y << 2);
// Mark which tiles have exclusions.
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/OpenCLContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/OpenCLContext::TileSize;
int index = (x > y ? x+y*numAtomBlocks-y*(y+1)/2 : y+x*numAtomBlocks-x*(x+1)/2);
tileVec[index] |= 1;
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
int lastTile = context.getNumAtoms()/OpenCLContext::TileSize;
for (int i = 0; i < numTiles; ++i) {
int x = tileVec[i]>>17;
int y = (tileVec[i]>>2)&0x7FFF;
if (x == lastTile || y == lastTile)
tileVec[i] |= 1;
}
}
// Build a list of indices for the tiles with exclusions. // Build a list of indices for the tiles with exclusions.
...@@ -227,7 +195,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -227,7 +195,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
} }
} }
atomExclusions.clear(); // We won't use this again, so free the memory it used atomExclusions.clear(); // We won't use this again, so free the memory it used
tiles->upload(tileVec);
exclusions->upload(exclusionVec); exclusions->upload(exclusionVec);
// Create data structures for the neighbor list. // Create data structures for the neighbor list.
...@@ -238,7 +205,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -238,7 +205,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount"); interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount");
blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter"); blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox"); blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
compact = new OpenCLCompact(context);
} }
// Create kernels. // Create kernels.
...@@ -246,6 +212,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -246,6 +212,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true); forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) { if (useCutoff) {
map<string, string> defines; map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
if (forceBufferPerAtomBlock) if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic) if (usePeriodic)
...@@ -256,13 +223,13 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -256,13 +223,13 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksKernel.setArg<cl_int>(0, tiles->getSize()); findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractingBlocksKernel.setArg<cl_float>(1, (cl_float) (cutoff*cutoff)); findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, tiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, blockCenter->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff)); findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
...@@ -293,10 +260,9 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -293,10 +260,9 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
findBlockBoundsKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); findBlockBoundsKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findBlockBoundsKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); findBlockBoundsKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
context.executeKernel(findBlockBoundsKernel, context.getNumAtoms()); context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
findInteractingBlocksKernel.setArg<mm_float4>(2, context.getPeriodicBoxSize()); findInteractingBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractingBlocksKernel.setArg<mm_float4>(3, context.getInvPeriodicBoxSize()); findInteractingBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms()); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms());
compact->compactStream(*interactingTiles, *tiles, *interactionFlags, *interactionCount);
if (context.getSIMDWidth() == 32) { if (context.getSIMDWidth() == 32) {
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize()); findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize()); findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
...@@ -305,12 +271,12 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -305,12 +271,12 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
} }
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (tiles != NULL) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(11, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(12, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize); context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize);
} }
} }
...@@ -443,8 +409,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -443,8 +409,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
index += 2; // The periodic box size arguments are set when the kernel is executed. index += 2; // The periodic box size arguments are set when the kernel is executed.
} }
else { else {
kernel.setArg<cl::Buffer>(index++, tiles->getDeviceBuffer()); kernel.setArg<cl_uint>(index++, context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2);
kernel.setArg<cl_uint>(index++, tiles->getSize());
} }
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(index++, params[i].getMemory()); kernel.setArg<cl::Memory>(index++, params[i].getMemory());
......
...@@ -35,8 +35,6 @@ ...@@ -35,8 +35,6 @@
namespace OpenMM { namespace OpenMM {
class OpenCLCompact;
/** /**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two * This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create Kernels that evaluate nonbonded interactions. Clients * ways. First, it can be used to create Kernels that evaluate nonbonded interactions. Clients
...@@ -141,12 +139,6 @@ public: ...@@ -141,12 +139,6 @@ public:
OpenCLArray<mm_float4>& getBlockBoundingBoxes() { OpenCLArray<mm_float4>& getBlockBoundingBoxes() {
return *blockBoundingBox; return *blockBoundingBox;
} }
/**
* Get the array containing the full set of tiles.
*/
OpenCLArray<cl_uint>& getTiles() {
return *tiles;
}
/** /**
* Get the array whose first element contains the number of tiles with interactions. * Get the array whose first element contains the number of tiles with interactions.
*/ */
...@@ -202,7 +194,6 @@ private: ...@@ -202,7 +194,6 @@ private:
cl::Kernel findBlockBoundsKernel; cl::Kernel findBlockBoundsKernel;
cl::Kernel findInteractingBlocksKernel; cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel; cl::Kernel findInteractionsWithinBlocksKernel;
OpenCLArray<cl_uint>* tiles;
OpenCLArray<cl_uint>* exclusions; OpenCLArray<cl_uint>* exclusions;
OpenCLArray<cl_uint>* exclusionIndices; OpenCLArray<cl_uint>* exclusionIndices;
OpenCLArray<cl_uint>* exclusionRowIndices; OpenCLArray<cl_uint>* exclusionRowIndices;
...@@ -214,7 +205,6 @@ private: ...@@ -214,7 +205,6 @@ private:
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
OpenCLCompact* compact;
std::string kernelSource; std::string kernelSource;
std::map<std::string, std::string> kernelDefines; std::map<std::string, std::string> kernelDefines;
double cutoff; double cutoff;
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* tiles, __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* tiles, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32 #define TILE_SIZE 32
#define GROUP_SIZE 64
#define BUFFER_GROUPS 4
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox) { __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, __global unsigned int* interactionCount) {
int index = get_global_id(0); int index = get_global_id(0);
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < numAtoms) { while (base < numAtoms) {
...@@ -32,21 +36,89 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe ...@@ -32,21 +36,89 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
index += get_global_size(0); index += get_global_size(0);
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
if (get_global_id(0) == 0)
interactionCount[0] = 0;
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory.
*/
void storeInteractionData(__local short2* buffer, __local bool* valid, __local int* sum, __local int* sum2, __local short2* temp, __local int* baseIndex,
__global unsigned int* interactionCount, __global unsigned int* interactingTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = (valid[i] ? 1 : 0);
barrier(CLK_LOCAL_MEM_FENCE);
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum2[i] = (i < offset ? sum[i] : sum[i]+sum[i-offset]);
else
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = (i < offset ? sum2[i] : sum2[i]+sum2[i-offset]);
whichBuffer = 1-whichBuffer;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (whichBuffer == 1) {
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = sum2[i];
barrier(CLK_LOCAL_MEM_FENCE);
}
// Compact the buffer and store it to global memory.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
if (valid[i]) {
temp[sum[i]-1] = buffer[i];
valid[i] = false;
}
barrier(CLK_LOCAL_MEM_FENCE);
int numValid = sum[BUFFER_SIZE-1];
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, numValid);
barrier(CLK_LOCAL_MEM_FENCE);
// Store it to global memory.
for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = (temp[i].x<<17)+(temp[i].y<<2);
barrier(CLK_LOCAL_MEM_FENCE);
} }
/** /**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart, * Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting. * mark them as non-interacting.
*/ */
__kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global unsigned int* tiles, __global float4* blockCenter, __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionFlag) { __global float4* blockBoundingBox, __global unsigned int* interactionCount, __global unsigned int* interactingTiles) {
int index = get_global_id(0); __local short2 buffer[BUFFER_SIZE];
while (index < numTiles) { __local bool valid[BUFFER_SIZE];
// Extract cell coordinates from appropriate work unit __local int sum[BUFFER_SIZE];
__local int sum2[BUFFER_SIZE];
__local short2 temp[BUFFER_SIZE];
__local int bufferFull;
__local int globalIndex;
int valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+get_local_id(0)] = false;
barrier(CLK_LOCAL_MEM_FENCE);
const int numTiles = (NUM_BLOCKS*(NUM_BLOCKS+1))/2;
for (int baseIndex = get_group_id(0)*get_local_size(0); baseIndex < numTiles; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
unsigned int x = tiles[index]; int index = baseIndex+get_local_id(0);
unsigned int y = ((x >> 2) & 0x7fff); if (index < numTiles) {
x = (x >> 17); unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (index-y*NUM_BLOCKS+y*(y+1)/2);
}
// Find the distance between the bounding boxes of the two cells. // Find the distance between the bounding boxes of the two cells.
...@@ -61,9 +133,27 @@ __kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, floa ...@@ -61,9 +133,27 @@ __kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, floa
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x); delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y); delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z); delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
interactionFlag[index] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1); if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
index += get_global_size(0); // Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
valid[bufferIndex] = true;
buffer[bufferIndex] = (short2) (x, y);
valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (bufferFull) {
storeInteractionData(buffer, valid, sum, sum2, temp, &globalIndex, interactionCount, interactingTiles);
valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
barrier(CLK_LOCAL_MEM_FENCE);
}
} }
storeInteractionData(buffer, valid, sum, sum2, temp, &globalIndex, interactionCount, interactingTiles);
} }
/** /**
......
...@@ -15,9 +15,9 @@ typedef struct { ...@@ -15,9 +15,9 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles, void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -194,9 +194,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -194,9 +194,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles, __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
......
...@@ -15,9 +15,9 @@ typedef struct { ...@@ -15,9 +15,9 @@ typedef struct {
*/ */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* tiles, void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -116,7 +116,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -116,7 +116,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id(0)].bornSum = 0.0f; localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) { if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
} }
...@@ -260,9 +260,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -260,9 +260,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles, __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
#else #else
unsigned int numTiles) { unsigned int numTiles) {
#endif #endif
...@@ -366,7 +366,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff ...@@ -366,7 +366,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id(0)].fw = 0.0f; localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos]; unsigned int flags = interactionFlags[pos];
if (flags != 0xFFFFFFFF) { if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) { if (flags == 0) {
// No interactions in this tile. // No interactions in this tile.
} }
......
...@@ -13,9 +13,9 @@ typedef struct { ...@@ -13,9 +13,9 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -13,9 +13,9 @@ typedef struct { ...@@ -13,9 +13,9 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize __global unsigned int* tiles, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
......
...@@ -521,7 +521,7 @@ void testBlockInteractions(bool periodic) { ...@@ -521,7 +521,7 @@ void testBlockInteractions(bool periodic) {
vector<cl_uint> interactionFlags; vector<cl_uint> interactionFlags;
nb.getInteractionCount().download(interactionCount); nb.getInteractionCount().download(interactionCount);
int numWithInteractions = interactionCount[0]; int numWithInteractions = interactionCount[0];
vector<bool> hasInteractions(nb.getTiles().getSize(), false); vector<bool> hasInteractions(numBlocks*(numBlocks+1)/2, false);
nb.getInteractingTiles().download(interactingTiles); nb.getInteractingTiles().download(interactingTiles);
nb.getInteractionFlags().download(interactionFlags); nb.getInteractionFlags().download(interactionFlags);
const unsigned int atoms = clcontext.getPaddedNumAtoms(); const unsigned int atoms = clcontext.getPaddedNumAtoms();
...@@ -580,13 +580,10 @@ void testBlockInteractions(bool periodic) { ...@@ -580,13 +580,10 @@ void testBlockInteractions(bool periodic) {
// Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff. // Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
vector<cl_uint> tiles;
nb.getTiles().download(tiles);
for (int i = 0; i < (int) hasInteractions.size(); i++) for (int i = 0; i < (int) hasInteractions.size(); i++)
if (!hasInteractions[i]) { if (!hasInteractions[i]) {
unsigned int tile = tiles[i]; unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
unsigned int x = (tile >> 17); unsigned int x = (i-y*numBlocks+y*(y+1)/2);
unsigned int y = ((tile >> 2) & 0x7fff);
for (int atom1 = 0; atom1 < blockSize; ++atom1) { for (int atom1 = 0; atom1 < blockSize; ++atom1) {
mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1]; mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) { for (int atom2 = 0; atom2 < blockSize; ++atom2) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment