Commit c3125421 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing implementation of nonbonded forces

parent d95e723d
......@@ -139,6 +139,8 @@ public:
* Copy the values in the Buffer to a vector.
*/
void download(std::vector<T>& data) const {
if (data.size() != size)
data.resize(size);
try {
context.getQueue().enqueueReadBuffer(*buffer, CL_TRUE, 0, size*sizeof(T), &data[0]);
}
......
/* Code for OpenCL stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
Affiliation: Stanford University
License: Public Domain
*/
#include "OpenCLCompact.h"
using namespace OpenMM;
OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context), dgBlockCounts(NULL) {
dgBlockCounts = new OpenCLArray<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
cl::Program program = context.createProgram(context.loadSourceFromFile("compact.cl"));
countKernel = cl::Kernel(program, "countElts");
moveValidKernel = cl::Kernel(program, "moveValidElementsStaged");
}
OpenCLCompact::~OpenCLCompact() {
if (dgBlockCounts != NULL)
delete dgBlockCounts;
}
void OpenCLCompact::compactStream(OpenCLArray<cl_uint>& dOut, OpenCLArray<cl_uint>& dIn, OpenCLArray<cl_uint>& dValid, OpenCLArray<cl_uint>& numValid) {
// Figure out # elements per block
int len = dIn.getSize();
unsigned int numBlocks = context.getNumThreadBlocks();
if (numBlocks*128 > len)
numBlocks = (len+127)/128;
const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);
// TODO: implement loop over blocks of 10M
// Phase 1: Calculate number of valid elements per thread block
countKernel.setArg<cl::Buffer>(0, dgBlockCounts->getDeviceBuffer());
countKernel.setArg<cl::Buffer>(1, dValid.getDeviceBuffer());
countKernel.setArg<cl_uint>(2, len);
countKernel.setArg(3, 128*sizeof(cl_uint), NULL);
context.executeKernel(countKernel, len, 128);
// Phase 2/3: Move valid elements using SIMD compaction
moveValidKernel.setArg<cl::Buffer>(0, dIn.getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(1, dOut.getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(2, dValid.getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(3, dgBlockCounts->getDeviceBuffer());
moveValidKernel.setArg<cl_uint>(4, len);
moveValidKernel.setArg<cl::Buffer>(5, numValid.getDeviceBuffer());
moveValidKernel.setArg(6, 128*sizeof(cl_uint), NULL);
moveValidKernel.setArg(7, 128*sizeof(cl_uint), NULL);
moveValidKernel.setArg(8, 128*sizeof(cl_uint), NULL);
context.executeKernel(moveValidKernel, len, 128);
}
#ifndef __OPENMM_OPENCLCOMPACT_H__
#define __OPENMM_OPENCLCOMPACT_H__
/* Code for OPENCL stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
Affiliation: Stanford University
License: Public Domain
*/
#include "OpenCLArray.h"
namespace OpenMM {
class OpenCLCompact {
public:
OpenCLCompact(OpenCLContext& context);
~OpenCLCompact();
void compactStream(OpenCLArray<cl_uint>& dOut, OpenCLArray<cl_uint>& dIn, OpenCLArray<cl_uint>& dValid, OpenCLArray<cl_uint>& numValid);
private:
OpenCLContext& context;
OpenCLArray<cl_uint>* dgBlockCounts;
cl::Kernel countKernel;
cl::Kernel moveValidKernel;
};
} // namespace OpenMM
#endif // __OPENMM_OPENCLCOMPACT_H__
\ No newline at end of file
......@@ -58,6 +58,9 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
device = devices[deviceIndex];
if (device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationOptions = "-cl-fast-relaxed-math";
if (device.getInfo<CL_DEVICE_VENDOR>() == "NVIDIA")
compilationOptions += " -DWARPS_ARE_ATOMIC";
queue = cl::CommandQueue(context, device);
numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
......@@ -132,21 +135,30 @@ string OpenCLContext::loadSourceFromFile(const string& filename) const {
return kernel;
}
cl::Program OpenCLContext::createProgram(const std::string source) {
cl::Program OpenCLContext::createProgram(const string source) {
return createProgram(source, map<string, string>());
}
cl::Program OpenCLContext::createProgram(const string source, const map<string, string>& defines) {
cl::Program::Sources sources(1, make_pair(source.c_str(), source.size()));
cl::Program program(context, sources);
string options = compilationOptions;
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter)
options += " -D"+iter->first+"="+iter->second;
try {
program.build(vector<cl::Device>(1, device));
program.build(vector<cl::Device>(1, device), options.c_str());
} catch (cl::Error err) {
throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
return program;
}
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits) {
int size = std::min((workUnits+ThreadBlockSize-1)/ThreadBlockSize, numThreadBlocks)*ThreadBlockSize;
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int workUnitSize) {
if (workUnitSize == -1)
workUnitSize = ThreadBlockSize;
int size = std::min((workUnits+workUnitSize-1)/workUnitSize, numThreadBlocks)*workUnitSize;
try {
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(ThreadBlockSize));
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(workUnitSize));
}
catch (cl::Error err) {
stringstream str;
......
......@@ -28,6 +28,8 @@
* -------------------------------------------------------------------------- */
#define __CL_ENABLE_EXCEPTIONS
#include <map>
#include <string>
#include <cl.hpp>
namespace OpenMM {
......@@ -139,13 +141,20 @@ public:
* Create an OpenCL Program from source code.
*/
cl::Program createProgram(const std::string source);
/**
* Create an OpenCL Program from source code.
*
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
*/
cl::Program createProgram(const std::string source, const std::map<std::string, std::string>& defines);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param workUnits the maximum number of work units that should be used
* @param kernel the kernel to execute
* @param workUnits the maximum number of work units that should be used
* @param workUnitSize the size of each work unit to use
*/
void executeKernel(cl::Kernel& kernel, int workUnits);
void executeKernel(cl::Kernel& kernel, int workUnits, int workUnitSize = -1);
/**
* Set all elements of an array to 0.
*/
......@@ -226,7 +235,7 @@ public:
/**
* Get the OpenCLNonbondedUtilities for this context.
*/
OpenCLNonbondedUtilities& getNonbondedUtilties() {
OpenCLNonbondedUtilities& getNonbondedUtilities() {
return *nonbonded;
}
private:
......@@ -237,6 +246,7 @@ private:
int numAtomBlocks;
int numThreadBlocks;
int numForceBuffers;
std::string compilationOptions;
cl::Context context;
cl::Device device;
cl::CommandQueue queue;
......
......@@ -47,17 +47,17 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
void OpenCLCalcForcesAndEnergyKernel::beginForceComputation(ContextImpl& context) {
cl.clearBuffer(cl.getForceBuffers());
cl.getNonbondedUtilties().prepareInteractions();
cl.getNonbondedUtilities().prepareInteractions();
}
void OpenCLCalcForcesAndEnergyKernel::finishForceComputation(ContextImpl& context) {
cl.reduceBuffer(cl.getForceBuffers(), cl.getNumForceBuffers());
cl.getNonbondedUtilties().prepareInteractions();
cl.getNonbondedUtilities().prepareInteractions();
}
void OpenCLCalcForcesAndEnergyKernel::beginEnergyComputation(ContextImpl& context) {
cl.clearBuffer(cl.getEnergyBuffer());
cl.getNonbondedUtilties().prepareInteractions();
cl.getNonbondedUtilities().prepareInteractions();
}
double OpenCLCalcForcesAndEnergyKernel::finishEnergyComputation(ContextImpl& context) {
......@@ -566,8 +566,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// }
// data.nonbondedMethod = method;
// gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
cl.getNonbondedUtilties().addInteraction(useCutoff, usePeriodic, force.getCutoffDistance(), exclusionList);
cl.getNonbondedUtilties().addParameter("sigmaEpsilon", "float2", 8, sigmaEpsilon->getDeviceBuffer());
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, force.getCutoffDistance(), exclusionList);
cl.getNonbondedUtilities().addParameter("sigmaEpsilon", "float2", 8, sigmaEpsilon->getDeviceBuffer());
cutoffSquared = force.getCutoffDistance()*force.getCutoffDistance();
// Compute the Ewald self energy.
......@@ -582,7 +582,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// Initialize the exceptions.
int numExceptions = exceptions.size();
int maxBuffers = 1;
int maxBuffers = cl.getNonbondedUtilities().getNumForceBuffers();
if (numExceptions > 0) {
exceptionParams = new OpenCLArray<mm_float4>(cl, numExceptions, "exceptionParams");
exceptionIndices = new OpenCLArray<mm_int4>(cl, numExceptions, "exceptionIndices");
......@@ -607,13 +607,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
}
void OpenCLCalcNonbondedForceKernel::executeForces(ContextImpl& context) {
cl.getNonbondedUtilties().computeInteractions();
cl.getNonbondedUtilities().computeInteractions();
if (exceptionIndices != NULL) {
int numExceptions = exceptionIndices->getSize();
exceptionsKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
exceptionsKernel.setArg<cl_int>(1, numExceptions);
exceptionsKernel.setArg<cl_float>(2, cutoffSquared);
exceptionsKernel.setArg<mm_float4>(3, cl.getNonbondedUtilties().getPeriodicBoxSize());
exceptionsKernel.setArg<mm_float4>(3, cl.getNonbondedUtilities().getPeriodicBoxSize());
exceptionsKernel.setArg<cl::Buffer>(4, cl.getForceBuffers().getDeviceBuffer());
exceptionsKernel.setArg<cl::Buffer>(5, cl.getEnergyBuffer().getDeviceBuffer());
exceptionsKernel.setArg<cl::Buffer>(6, cl.getPosq().getDeviceBuffer());
......
......@@ -26,13 +26,25 @@
#include "OpenCLNonbondedUtilities.h"
#include "OpenCLArray.h"
#include "OpenCLCompact.h"
#include <map>
using namespace OpenMM;
using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL) {
numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) {
// Decide how many force buffers to use.
forceBufferPerAtomBlock = false;
numForceBuffers = context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize/OpenCLContext::TileSize;
if (numForceBuffers >= context.getNumAtomBlocks()) {
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock = true;
numForceBuffers = context.getNumAtomBlocks();
}
}
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
......@@ -42,6 +54,18 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete exclusionIndex;
if (exclusions != NULL)
delete exclusions;
if (interactingTiles != NULL)
delete interactingTiles;
if (interactionFlags != NULL)
delete interactionFlags;
if (interactionCount != NULL)
delete interactionCount;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (compact != NULL)
delete compact;
}
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList) {
......@@ -90,21 +114,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for (unsigned int x = y; x < numAtomBlocks; x++)
tileVec[count++] = (x << 17) | (y << 2);
// Decide how many force buffers to use.
bool forceBufferPerAtomBlock = false;
numForceBuffers = context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize/OpenCLContext::TileSize;
if (numForceBuffers >= numAtomBlocks) {
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock = true;
numForceBuffers = numAtomBlocks;
}
// Create kernels.
cl::Program forceProgram = context.createProgram(context.loadSourceFromFile("nonbonded.cl"));
map<string, string> defines;
if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "true";
cl::Program forceProgram = context.createProgram(context.loadSourceFromFile("nonbonded.cl"), defines);
forceKernel = cl::Kernel(forceProgram, "computeNonbonded");
cl::Program interactingBlocksProgram = context.createProgram(context.loadSourceFromFile("findInteractingBlocks.cl"), defines);
findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
// Mark which tiles have exclusions.
......@@ -182,16 +202,60 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
tiles->upload(tileVec);
exclusions->upload(exclusionVec);
exclusionIndex->upload(exclusionIndexVec);
// Record the periodic box size.
Vec3 boxVectors[3];
system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
periodicBoxSize = (mm_float4) {(float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2], 0.0f};
// Create data structures for the neighbor list.
if (useCutoff) {
interactingTiles = new OpenCLArray<cl_uint>(context, numTiles, "interactingTiles");
interactionFlags = new OpenCLArray<cl_uint>(context, numTiles, "interactionFlags");
interactionCount = new OpenCLArray<cl_uint>(context, 1, "interactionCount");
blockCenter = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = new OpenCLArray<mm_float4>(context, numAtomBlocks, "blockBoundingBox");
compact = new OpenCLCompact(context);
}
}
void OpenCLNonbondedUtilities::prepareInteractions() {
hasComputedInteractions = false;
if (!useCutoff)
return;
// TODO compute the neighbor list
// Compute the neighbor list.
findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
findBlockBoundsKernel.setArg<mm_float4>(1, periodicBoxSize);
findBlockBoundsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
findInteractingBlocksKernel.setArg<cl_int>(0, tiles->getSize());
findInteractingBlocksKernel.setArg<cl_float>(1, cutoff*cutoff);
findInteractingBlocksKernel.setArg<mm_float4>(2, periodicBoxSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(3, tiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactionFlags->getDeviceBuffer());
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms());
compact->compactStream(*interactingTiles, *tiles, *interactionFlags, *interactionCount);
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, cutoff*cutoff);
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, periodicBoxSize);
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, tiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL);
context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms());
}
void OpenCLNonbondedUtilities::computeInteractions() {
......
......@@ -34,6 +34,8 @@
namespace OpenMM {
class OpenCLCompact;
/**
* This class implements features that are used by several different force. It provides
* a generic interface for calculating nonbonded interactions.
......@@ -86,17 +88,62 @@ public:
* prepareInteractions(). Additional calls return immediately without doing anything.
*/
void computeInteractions();
/**
* Get the array containing the center of each atom block.
*/
OpenCLArray<mm_float4>& getBlockCenters() {
return *blockCenter;
}
/**
* Get the array containing the dimensions of each atom block.
*/
OpenCLArray<mm_float4>& getBlockBoundingBoxes() {
return *blockBoundingBox;
}
/**
* Get the array containing the full set of tiles.
*/
OpenCLArray<cl_uint>& getTiles() {
return *tiles;
}
/**
* Get the array whose first element contains the number of tiles with interactions.
*/
OpenCLArray<cl_uint>& getInteractionCount() {
return *interactionCount;
}
/**
* Get the array containing tiles with interactions.
*/
OpenCLArray<cl_uint>& getInteractingTiles() {
return *interactingTiles;
}
/**
* Get the array containing flags for tiles with interactions.
*/
OpenCLArray<cl_uint>& getInteractionFlags() {
return *interactionFlags;
}
private:
class ParameterInfo;
OpenCLContext& context;
cl::Kernel forceKernel;
cl::Kernel findBlockBoundsKernel;
cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel;
OpenCLArray<cl_uint>* tiles;
OpenCLArray<cl_uint>* exclusionIndex;
OpenCLArray<cl_uint>* exclusions;
OpenCLArray<cl_uint>* interactingTiles;
OpenCLArray<cl_uint>* interactionFlags;
OpenCLArray<cl_uint>* interactionCount;
OpenCLArray<mm_float4>* blockCenter;
OpenCLArray<mm_float4>* blockBoundingBox;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
OpenCLCompact* compact;
double cutoff;
bool useCutoff, usePeriodic, hasComputedInteractions;
bool useCutoff, usePeriodic, forceBufferPerAtomBlock, hasComputedInteractions;
int numForceBuffers;
mm_float4 periodicBoxSize;
};
......
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
Affiliation: Stanford University
License: Public Domain
*/
// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
unsigned int sumReduce128(__local unsigned int* arr) {
// Parallel reduce element counts
// Assumes 128 thd/block
int thread = get_local_id(0);
if (thread < 64) arr[thread] += arr[thread+64];
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef WARPS_ARE_ATOMIC
if (thread < 32) {
arr[thread] += arr[thread+32];
if (thread < 16) arr[thread] += arr[thread+16];
if (thread < 8) arr[thread] += arr[thread+8];
if (thread < 4) arr[thread] += arr[thread+4];
if (thread < 2) arr[thread] += arr[thread+2];
if (thread < 1) arr[thread] += arr[thread+1];
}
#else
if (thread < 32) arr[thread] += arr[thread+32];
barrier(CLK_LOCAL_MEM_FENCE);
if (thread < 16) arr[thread] += arr[thread+16];
barrier(CLK_LOCAL_MEM_FENCE);
if (thread < 8) arr[thread] += arr[thread+8];
barrier(CLK_LOCAL_MEM_FENCE);
if (thread < 4) arr[thread] += arr[thread+4];
barrier(CLK_LOCAL_MEM_FENCE);
if (thread < 2) arr[thread] += arr[thread+2];
barrier(CLK_LOCAL_MEM_FENCE);
if (thread < 1) arr[thread] += arr[thread+1];
#endif
barrier(CLK_LOCAL_MEM_FENCE);
return arr[0];
}
__kernel void countElts(__global unsigned int* dgBlockCounts, __global unsigned int* dgValid, const unsigned int len, __local unsigned int* dsCount) {
dsCount[get_local_id(0)] = 0;
unsigned int ub;
const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
ub = (len < (get_group_id(0)+1)*eltsPerBlock) ? len : ((get_group_id(0) + 1)*eltsPerBlock);
for (int base = get_group_id(0) * eltsPerBlock; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
if ((base + get_local_id(0)) < ub && dgValid[base+get_local_id(0)])
dsCount[get_local_id(0)]++;
}
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int blockCount = sumReduce128(dsCount);
if (get_local_id(0) == 0) dgBlockCounts[get_group_id(0)] = blockCount;
return;
}
// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
// Exclusive prefix scan over 128 elements
// Assumes 128 threads
// Taken from cuda SDK "scan" sample for naive scan, with small modifications
int exclusivePrescan128(__local const unsigned int* in, __local unsigned int* outAndTemp) {
const int n=128;
//TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
//__shared__ int temp[2*n];
__local unsigned int* temp = outAndTemp;
int pout = 1, pin = 0;
// load input into temp
// This is exclusive scan, so shift right by one and set first elt to 0
int thread = get_local_id(0);
temp[pout*n + get_local_id(0)] = (get_local_id(0) > 0) ? in[get_local_id(0)-1] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
for (int offset = 1; offset < n; offset *= 2)
{
pout = 1 - pout; // swap double buffer indices
pin = 1 - pout;
barrier(CLK_LOCAL_MEM_FENCE);
temp[pout*n+get_local_id(0)] = temp[pin*n+get_local_id(0)];
if (get_local_id(0) >= offset)
temp[pout*n+get_local_id(0)] += temp[pin*n+get_local_id(0) - offset];
}
//out[get_local_id(0)] = temp[pout*n+get_local_id(0)]; // write output
barrier(CLK_LOCAL_MEM_FENCE);
return outAndTemp[127]+in[127]; // Return sum of all elements
}
int compactSIMDPrefixSum(__local const unsigned int* dsData, __local const unsigned int* dsValid, __local unsigned int* dsCompact, __local unsigned int* dsLocalIndex) {
int numValid = exclusivePrescan128(dsValid,dsLocalIndex);
int thread = get_local_id(0);
if (dsValid[get_local_id(0)]) dsCompact[dsLocalIndex[get_local_id(0)]] = dsData[get_local_id(0)];
return numValid;
}
__kernel void moveValidElementsStaged(__global unsigned int* dgData, __global unsigned int* dgCompact, __global unsigned int* dgValid,
__global unsigned int* dgBlockCounts, unsigned int len, __global unsigned int* dNumValidElements,
__local unsigned int* inBlock, __local unsigned int* validBlock, __local unsigned int* compactBlock) {
__local unsigned int dsLocalIndex[256];
int blockOutOffset=0;
// Sum up the blockCounts before us to find our offset
// This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
// Paper implements this as a prefix sum kernel in phase II
// May still be faster than an extra kernel invocation?
int thread = get_local_id(0);
for (int base = 0; base < get_group_id(0); base += get_local_size(0)) {
// Load up the count of valid elements for each block before us in batches of 128
if ((base + get_local_id(0)) < get_group_id(0)) {
validBlock[get_local_id(0)] = dgBlockCounts[base+get_local_id(0)];
} else {
validBlock[get_local_id(0)] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Parallel reduce these counts
// Accumulate in the final offset variable
blockOutOffset += sumReduce128(validBlock);
}
unsigned int ub;
const unsigned int eltsPerBlock = len/get_num_groups(0) + ((len % get_num_groups(0)) ? 1 : 0);
ub = (len < (get_group_id(0)+1)*eltsPerBlock) ? len : ((get_group_id(0) + 1)*eltsPerBlock);
for (int base = get_group_id(0) * eltsPerBlock; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
if ((base + get_local_id(0)) < ub) {
validBlock[get_local_id(0)] = dgValid[base+get_local_id(0)];
inBlock[get_local_id(0)] = dgData[base+get_local_id(0)];
} else {
validBlock[get_local_id(0)] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock,dsLocalIndex);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < numValidBlock) {
dgCompact[blockOutOffset + get_local_id(0)] = compactBlock[get_local_id(0)];
}
blockOutOffset += numValidBlock;
}
if (get_group_id(0) == (get_num_groups(0)-1) && get_local_id(0) == 0) {
*dNumValidElements = blockOutOffset;
}
}
const int BlockSize = 32;
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, __global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox) {
int index = get_global_id(0);
int base = index*BlockSize;
while (base < numAtoms) {
float4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x/periodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y/periodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z/periodicBoxSize.z)*periodicBoxSize.z;
float4 firstPoint = pos;
#endif
float4 minPos = pos;
float4 maxPos = pos;
int last = min(base+BlockSize, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[i];
#ifdef USE_PERIODIC
pos.x -= floor((pos.x-firstPoint.x)/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-firstPoint.y)/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-firstPoint.z)/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
blockBoundingBox[index] = 0.5f*(maxPos-minPos);
blockCenter[index] = 0.5f*(maxPos+minPos);
index += get_global_size(0);
base = index*BlockSize;
}
}
/**
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(int numTiles, float cutoffSquared, float4 periodicBoxSize, __global unsigned int* tiles, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionFlag) {
int index = get_global_id(0);
while (index < numTiles) {
// Extract cell coordinates from appropriate work unit
unsigned int x = tiles[index];
unsigned int y = ((x >> 2) & 0x7fff);
x = (x >> 17);
// Find the distance between the bounding boxes of the two cells.
float4 delta = blockCenter[x]-blockCenter[y];
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float4 boxSizea = blockBoundingBox[x];
float4 boxSizeb = blockBoundingBox[y];
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
interactionFlag[index] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1);
index += get_global_size(0);
}
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, __global float4* posq, __global unsigned int* tiles, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local unsigned int* flags) {
unsigned int totalWarps = get_global_size(0)/BlockSize;
unsigned int warp = get_global_id(0)/BlockSize;
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
unsigned int index = get_local_id(0) & (BlockSize - 1);
unsigned int lasty = 0xFFFFFFFF;
float4 apos;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff);
bool bExclusionFlag = (x & 0x1);
x = (x >> 17);
if (x == y || bExclusionFlag)
{
// Assume this block will be dense.
if (index == 0)
interactionFlags[pos] = 0xFFFFFFFF;
}
else
{
// Load the bounding box for x and the atom positions for y.
float4 center = blockCenter[x];
float4 boxSize = blockBoundingBox[x];
if (y != lasty)
apos = posq[(y*BlockSize)+index];
// Find the distance of the atom from the bounding box.
float4 delta = apos-center;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/periodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y/periodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z/periodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta = max((float4) 0.0f, fabs(delta)-boxSize);
int thread = get_local_id(0);
flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index);
// Sum the flags.
if (index % 2 == 0)
flags[thread] += flags[thread+1];
if (index % 4 == 0)
flags[thread] += flags[thread+2];
if (index % 8 == 0)
flags[thread] += flags[thread+4];
if (index % 16 == 0)
flags[thread] += flags[thread+8];
if (index == 0)
{
unsigned int allFlags = flags[thread] + flags[thread+16];
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
}
lasty = y;
}
pos++;
}
}
......@@ -84,16 +84,16 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += af.xyz;
forceBuffers[offset] = of;
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
of.xyz = af.xyz;
of.w = 0.0f;
unsigned int offset = x + tgx + (x/TileSize) * paddedNumAtoms;
forceBuffers[offset] = of;
#else
unsigned int offset = x + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += af.xyz;
forceBuffers[offset] = of;
#endif
}
else {
......@@ -169,14 +169,14 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
}
}
}
else // bExclusion
else
#endif
{
// Read fixed atom data into registers and GRF
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx));
for (unsigned int j = 0; j < TileSize; j++) {
bool isExcluded = !(excl & 0x1);
......@@ -222,16 +222,7 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += af.xyz;
forceBuffers[offset] = of;
offset = y + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += local_foce[get_local_id(0)].xyz;
forceBuffers[offset] = of;
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
of.xyz = af.xyz;
of.w = 0.0f;
unsigned int offset = x + tgx + (y/TileSize) * paddedNumAtoms;
......@@ -239,6 +230,15 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
of = local_force[get_local_id(0)];
offset = y + tgx + (x/TileSize) * paddedNumAtoms;
forceBuffers[offset] = of;
#else
unsigned int offset = x + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += af.xyz;
forceBuffers[offset] = of;
offset = y + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += local_force[get_local_id(0)].xyz;
forceBuffers[offset] = of;
#endif
lasty = y;
}
......
......@@ -43,7 +43,8 @@
#include "openmm/LangevinIntegrator.h"
#include "openmm/VerletIntegrator.h"
#include "openmm/internal/ContextImpl.h"
//#include "kernels/gputypes.h"
#include "OpenCLArray.h"
#include "OpenCLNonbondedUtilities.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "../src/sfmt/SFMT.h"
#include <iostream>
......@@ -438,167 +439,176 @@ void testLargeSystem() {
ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
}
//void testBlockInteractions(bool periodic) {
// const int blockSize = 32;
// const int numBlocks = 100;
// const int numParticles = blockSize*numBlocks;
// const double cutoff = 1.0;
// const double boxSize = (periodic ? 5.1 : 1.1);
// OpenCLPlatform cl;
// System system;
// VerletIntegrator integrator(0.01);
// NonbondedForce* nonbonded = new NonbondedForce();
// vector<Vec3> positions(numParticles);
// init_gen_rand(0);
// for (int i = 0; i < numParticles; i++) {
// system.addParticle(1.0);
// nonbonded->addParticle(1.0, 0.2, 0.2);
// positions[i] = Vec3(boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1));
// }
// nonbonded->setNonbondedMethod(periodic ? NonbondedForce::CutoffPeriodic : NonbondedForce::CutoffNonPeriodic);
// nonbonded->setCutoffDistance(cutoff);
// system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
// system.addForce(nonbonded);
// Context context(system, integrator, cl);
// context.setPositions(positions);
// State state = context.getState(State::Positions | State::Velocities | State::Forces);
// ContextImpl* contextImpl = *reinterpret_cast<ContextImpl**>(&context);
// OpenCLPlatform::PlatformData& data = *static_cast<OpenCLPlatform::PlatformData*>(contextImpl->getPlatformData());
//
// // Verify that the bounds of each block were calculated correctly.
//
// data.gpu->psPosq4->Download();
// data.gpu->psGridBoundingBox->Download();
// data.gpu->psGridCenter->Download();
// for (int i = 0; i < numBlocks; i++) {
// float4 gridSize = (*data.gpu->psGridBoundingBox)[i];
// float4 center = (*data.gpu->psGridCenter)[i];
// if (periodic) {
// ASSERT(gridSize.x < 0.5*boxSize);
// ASSERT(gridSize.y < 0.5*boxSize);
// ASSERT(gridSize.z < 0.5*boxSize);
// }
// float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
// for (int j = 0; j < blockSize; j++) {
// float4 pos = (*data.gpu->psPosq4)[i*blockSize+j];
// float dx = pos.x-center.x;
// float dy = pos.y-center.y;
// float dz = pos.z-center.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// ASSERT(abs(dx) < gridSize.x+TOL);
// ASSERT(abs(dy) < gridSize.y+TOL);
// ASSERT(abs(dz) < gridSize.z+TOL);
// minx = min(minx, dx);
// maxx = max(maxx, dx);
// miny = min(miny, dy);
// maxy = max(maxy, dy);
// minz = min(minz, dz);
// maxz = max(maxz, dz);
// }
// ASSERT_EQUAL_TOL(-minx, gridSize.x, TOL);
// ASSERT_EQUAL_TOL(maxx, gridSize.x, TOL);
// ASSERT_EQUAL_TOL(-miny, gridSize.y, TOL);
// ASSERT_EQUAL_TOL(maxy, gridSize.y, TOL);
// ASSERT_EQUAL_TOL(-minz, gridSize.z, TOL);
// ASSERT_EQUAL_TOL(maxz, gridSize.z, TOL);
// }
//
// // Verify that interactions were identified correctly.
//
// data.gpu->psInteractionCount->Download();
// int numWithInteractions = (*data.gpu->psInteractionCount)[0];
// vector<bool> hasInteractions(data.gpu->sim.workUnits, false);
// data.gpu->psInteractingWorkUnit->Download();
// data.gpu->psInteractionFlag->Download();
// const unsigned int atoms = data.gpu->sim.paddedNumberOfAtoms;
// const unsigned int grid = data.gpu->grid;
// const unsigned int dim = (atoms+(grid-1))/grid;
// for (int i = 0; i < numWithInteractions; i++) {
// unsigned int workUnit = (*data.gpu->psInteractingWorkUnit)[i];
// unsigned int x = (workUnit >> 17);
// unsigned int y = ((workUnit >> 2) & 0x7fff);
// int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
// hasInteractions[tile] = true;
//
// // Make sure this tile really should have been flagged based on bounding volumes.
//
// float4 gridSize1 = (*data.gpu->psGridBoundingBox)[x];
// float4 gridSize2 = (*data.gpu->psGridBoundingBox)[y];
// float4 center1 = (*data.gpu->psGridCenter)[x];
// float4 center2 = (*data.gpu->psGridCenter)[y];
// float dx = center1.x-center2.x;
// float dy = center1.y-center2.y;
// float dz = center1.z-center2.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// dx = max(0.0f, abs(dx)-gridSize1.x-gridSize2.x);
// dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
// dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
// ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
//
// // Check the interaction flags.
//
// unsigned int flags = (*data.gpu->psInteractionFlag)[i];
// for (int atom2 = 0; atom2 < 32; atom2++) {
// if ((flags & 1) == 0) {
// float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
// for (int atom1 = 0; atom1 < blockSize; ++atom1) {
// float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
// float dx = pos2.x-pos1.x;
// float dy = pos2.y-pos1.y;
// float dz = pos2.z-pos1.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
// dx = pos2.x-center1.x;
// dy = pos2.y-center1.y;
// dz = pos2.z-center1.z;
// dx = max(0.0f, abs(dx)-gridSize1.x);
// dy = max(0.0f, abs(dy)-gridSize1.y);
// dz = max(0.0f, abs(dz)-gridSize1.z);
// }
// ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
// }
// }
// flags >>= 1;
// }
// }
//
// // Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
//
// data.gpu->psWorkUnit->Download();
// for (int i = 0; i < (int)hasInteractions.size(); i++)
// if (!hasInteractions[i]) {
// unsigned int workUnit = (*data.gpu->psWorkUnit)[i];
// unsigned int x = (workUnit >> 17);
// unsigned int y = ((workUnit >> 2) & 0x7fff);
// for (int atom1 = 0; atom1 < blockSize; ++atom1) {
// float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
// for (int atom2 = 0; atom2 < blockSize; ++atom2) {
// float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
// float dx = pos1.x-pos2.x;
// float dy = pos1.y-pos2.y;
// float dz = pos1.z-pos2.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
// }
// }
// }
//}
void testBlockInteractions(bool periodic) {
const int blockSize = 32;
const int numBlocks = 100;
const int numParticles = blockSize*numBlocks;
const double cutoff = 1.0;
const double boxSize = (periodic ? 5.1 : 1.1);
OpenCLPlatform cl;
System system;
VerletIntegrator integrator(0.01);
NonbondedForce* nonbonded = new NonbondedForce();
vector<Vec3> positions(numParticles);
init_gen_rand(0);
for (int i = 0; i < numParticles; i++) {
system.addParticle(1.0);
nonbonded->addParticle(1.0, 0.2, 0.2);
positions[i] = Vec3(boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1));
}
nonbonded->setNonbondedMethod(periodic ? NonbondedForce::CutoffPeriodic : NonbondedForce::CutoffNonPeriodic);
nonbonded->setCutoffDistance(cutoff);
system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
system.addForce(nonbonded);
Context context(system, integrator, cl);
context.setPositions(positions);
State state = context.getState(State::Positions | State::Velocities | State::Forces);
ContextImpl* contextImpl = *reinterpret_cast<ContextImpl**>(&context);
OpenCLPlatform::PlatformData& data = *static_cast<OpenCLPlatform::PlatformData*>(contextImpl->getPlatformData());
OpenCLContext& clcontext = *data.context;
// Verify that the bounds of each block were calculated correctly.
clcontext.getPosq().download();
vector<mm_float4> blockCenters(numBlocks);
vector<mm_float4> blockBoundingBoxes(numBlocks);
OpenCLNonbondedUtilities& nb = clcontext.getNonbondedUtilities();
nb.getBlockCenters().download(blockCenters);
nb.getBlockBoundingBoxes().download(blockBoundingBoxes);
for (int i = 0; i < numBlocks; i++) {
mm_float4 gridSize = blockBoundingBoxes[i];
mm_float4 center = blockCenters[i];
if (periodic) {
ASSERT(gridSize.x < 0.5*boxSize);
ASSERT(gridSize.y < 0.5*boxSize);
ASSERT(gridSize.z < 0.5*boxSize);
}
float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
for (int j = 0; j < blockSize; j++) {
mm_float4 pos = clcontext.getPosq()[i*blockSize+j];
float dx = pos.x-center.x;
float dy = pos.y-center.y;
float dz = pos.z-center.z;
if (periodic) {
dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
}
ASSERT(abs(dx) < gridSize.x+TOL);
ASSERT(abs(dy) < gridSize.y+TOL);
ASSERT(abs(dz) < gridSize.z+TOL);
minx = min(minx, dx);
maxx = max(maxx, dx);
miny = min(miny, dy);
maxy = max(maxy, dy);
minz = min(minz, dz);
maxz = max(maxz, dz);
}
ASSERT_EQUAL_TOL(-minx, gridSize.x, TOL);
ASSERT_EQUAL_TOL(maxx, gridSize.x, TOL);
ASSERT_EQUAL_TOL(-miny, gridSize.y, TOL);
ASSERT_EQUAL_TOL(maxy, gridSize.y, TOL);
ASSERT_EQUAL_TOL(-minz, gridSize.z, TOL);
ASSERT_EQUAL_TOL(maxz, gridSize.z, TOL);
}
// Verify that interactions were identified correctly.
vector<cl_uint> interactionCount;
vector<cl_uint> interactingTiles;
vector<cl_uint> interactionFlags;
nb.getInteractionCount().download(interactionCount);
int numWithInteractions = interactionCount[0];
vector<bool> hasInteractions(nb.getTiles().getSize(), false);
nb.getInteractingTiles().download(interactingTiles);
nb.getInteractionFlags().download(interactionFlags);
const unsigned int atoms = clcontext.getPaddedNumAtoms();
const unsigned int grid = OpenCLContext::TileSize;
const unsigned int dim = clcontext.getNumAtomBlocks();
printf("%d of %d\n", numWithInteractions, hasInteractions.size());
for (int i = 0; i < numWithInteractions; i++) {
unsigned int tile = interactingTiles[i];
unsigned int x = (tile >> 17);
unsigned int y = ((tile >> 2) & 0x7fff);
int index = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
hasInteractions[index] = true;
// Make sure this tile really should have been flagged based on bounding volumes.
mm_float4 gridSize1 = blockBoundingBoxes[x];
mm_float4 gridSize2 = blockBoundingBoxes[y];
mm_float4 center1 = blockCenters[x];
mm_float4 center2 = blockCenters[y];
float dx = center1.x-center2.x;
float dy = center1.y-center2.y;
float dz = center1.z-center2.z;
if (periodic) {
dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
}
dx = max(0.0f, abs(dx)-gridSize1.x-gridSize2.x);
dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
// Check the interaction flags.
unsigned int flags = interactionFlags[i];
for (int atom2 = 0; atom2 < 32; atom2++) {
if ((flags & 1) == 0) {
mm_float4 pos2 = clcontext.getPosq()[y*blockSize+atom2];
for (int atom1 = 0; atom1 < blockSize; ++atom1) {
mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1];
float dx = pos2.x-pos1.x;
float dy = pos2.y-pos1.y;
float dz = pos2.z-pos1.z;
if (periodic) {
dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
}
if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
dx = pos2.x-center1.x;
dy = pos2.y-center1.y;
dz = pos2.z-center1.z;
dx = max(0.0f, abs(dx)-gridSize1.x);
dy = max(0.0f, abs(dy)-gridSize1.y);
dz = max(0.0f, abs(dz)-gridSize1.z);
}
ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
}
}
flags >>= 1;
}
}
// Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
vector<cl_uint> tiles;
nb.getTiles().download(tiles);
for (int i = 0; i < (int) hasInteractions.size(); i++)
if (!hasInteractions[i]) {
unsigned int tile = tiles[i];
unsigned int x = (tile >> 17);
unsigned int y = ((tile >> 2) & 0x7fff);
for (int atom1 = 0; atom1 < blockSize; ++atom1) {
mm_float4 pos1 = clcontext.getPosq()[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) {
mm_float4 pos2 = clcontext.getPosq()[y*blockSize+atom2];
float dx = pos1.x-pos2.x;
float dy = pos1.y-pos2.y;
float dz = pos1.z-pos2.z;
if (periodic) {
dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
}
ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
}
}
}
}
int main() {
try {
......@@ -609,7 +619,7 @@ int main() {
// testCutoff14();
// testPeriodic();
// testLargeSystem();
// testBlockInteractions(false);
testBlockInteractions(false);
// testBlockInteractions(true);
}
catch(const exception& e) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment