Commit a993d7ab authored by Peter Eastman's avatar Peter Eastman
Browse files

Merge remote-tracking branch 'origin/master' into charmm

parents fd76052f fcba92a6
...@@ -190,7 +190,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -190,7 +190,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -203,7 +203,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -203,7 +203,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -166,7 +166,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -166,7 +166,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -179,7 +179,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -179,7 +179,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -226,7 +226,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -226,7 +226,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -239,7 +239,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -239,7 +239,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
...@@ -590,7 +590,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -590,7 +590,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -603,7 +603,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -603,7 +603,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -333,7 +333,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -333,7 +333,7 @@ extern "C" __global__ void computeNonbonded(
bool includeTile = true; bool includeTile = true;
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -346,7 +346,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -346,7 +346,7 @@ extern "C" __global__ void computeNonbonded(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -292,6 +292,7 @@ void testWithThermostat() { ...@@ -292,6 +292,7 @@ void testWithThermostat() {
system.addForce(forceField); system.addForce(forceField);
AndersenThermostat* thermostat = new AndersenThermostat(temp, collisionFreq); AndersenThermostat* thermostat = new AndersenThermostat(temp, collisionFreq);
system.addForce(thermostat); system.addForce(thermostat);
integrator.setRandomNumberSeed(thermostat->getRandomNumberSeed());
Context context(system, integrator, platform); Context context(system, integrator, platform);
vector<Vec3> positions(numParticles); vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; ++i) for (int i = 0; i < numParticles; ++i)
...@@ -424,6 +425,7 @@ void testParameter() { ...@@ -424,6 +425,7 @@ void testParameter() {
integrator.addGlobalVariable("temp", 0); integrator.addGlobalVariable("temp", 0);
integrator.addComputeGlobal("temp", "AndersenTemperature"); integrator.addComputeGlobal("temp", "AndersenTemperature");
integrator.addComputeGlobal("AndersenTemperature", "temp*2"); integrator.addComputeGlobal("AndersenTemperature", "temp*2");
integrator.setRandomNumberSeed(thermostat->getRandomNumberSeed());
Context context(system, integrator, platform); Context context(system, integrator, platform);
// See if the parameter is being used correctly. // See if the parameter is being used correctly.
......
...@@ -389,7 +389,7 @@ int main(int argc, char* argv[]) { ...@@ -389,7 +389,7 @@ int main(int argc, char* argv[]) {
testIdealGasAxis(1); testIdealGasAxis(1);
testIdealGasAxis(2); testIdealGasAxis(2);
testRandomSeed(); testRandomSeed();
testEinsteinCrystal(); //testEinsteinCrystal();
} }
catch(const exception& e) { catch(const exception& e) {
cout << "exception: " << e.what() << endl; cout << "exception: " << e.what() << endl;
......
...@@ -284,7 +284,8 @@ private: ...@@ -284,7 +284,8 @@ private:
std::map<std::string, std::string> kernelDefines; std::map<std::string, std::string> kernelDefines;
double cutoff; double cutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding; bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup; int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, nonbondedForceGroup;
}; };
/** /**
......
This diff is collapsed.
...@@ -4820,7 +4820,7 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c ...@@ -4820,7 +4820,7 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
kernel1 = cl::Kernel(program, "integrateVerletPart1"); kernel1 = cl::Kernel(program, "integrateVerletPart1");
kernel2 = cl::Kernel(program, "integrateVerletPart2"); kernel2 = cl::Kernel(program, "integrateVerletPart2");
selectSizeKernel = cl::Kernel(program, "selectVerletStepSize"); selectSizeKernel = cl::Kernel(program, "selectVerletStepSize");
blockSize = min(min(256, system.getNumParticles()), (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); blockSize = min(min(256, system.getNumParticles()), (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
} }
double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) { double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
...@@ -4930,7 +4930,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system, ...@@ -4930,7 +4930,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams"); params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
blockSize = min(256, system.getNumParticles()); blockSize = min(256, system.getNumParticles());
blockSize = max(blockSize, params->getSize()); blockSize = max(blockSize, params->getSize());
blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); blockSize = min(blockSize, (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
} }
double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) { double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
......
...@@ -317,42 +317,55 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -317,42 +317,55 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++) for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size()); maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions); defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["GROUP_SIZE"] = (deviceIsCpu ? "32" : "128");
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2"); defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks); string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
cl::Program interactingBlocksProgram = context.createProgram(file, defines); int groupSize = (deviceIsCpu ? 32 : 128);
findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds"); while (true) {
findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms()); defines["GROUP_SIZE"] = context.intToString(groupSize);
findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer()); cl::Program interactingBlocksProgram = context.createProgram(file, defines);
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer()); findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
findBlockBoundsKernel.setArg<cl::Buffer>(6, rebuildNeighborList->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(7, sortedBlocks->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData"); findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(6, rebuildNeighborList->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer()); findBlockBoundsKernel.setArg<cl::Buffer>(7, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer()); sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions"); sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(2, interactionCount->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(3, interactingTiles->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, interactingAtoms->getDeviceBuffer()); findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(2, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(6, interactingTiles->getSize()); findInteractingBlocksKernel.setArg<cl::Buffer>(3, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(7, startBlockIndex); findInteractingBlocksKernel.setArg<cl::Buffer>(4, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(8, numBlocks); findInteractingBlocksKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(9, sortedBlocks->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl_uint>(6, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl::Buffer>(10, sortedBlockCenter->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl_uint>(7, startBlockIndex);
findInteractingBlocksKernel.setArg<cl::Buffer>(11, sortedBlockBoundingBox->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl_uint>(8, numBlocks);
findInteractingBlocksKernel.setArg<cl::Buffer>(12, exclusionIndices->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(9, sortedBlocks->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(13, exclusionRowIndices->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(10, sortedBlockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(14, oldPositions->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(11, sortedBlockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(15, rebuildNeighborList->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(12, exclusionIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(13, exclusionRowIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(14, oldPositions->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(15, rebuildNeighborList->getDeviceBuffer());
if (findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
// The device can't handle this block size, so reduce it.
groupSize -= 32;
if (groupSize < 32)
throw OpenMMException("Failed to create findInteractingBlocks kernel");
continue;
}
break;
}
interactingBlocksThreadBlockSize = (deviceIsCpu ? 1 : groupSize);
} }
} }
...@@ -389,7 +402,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -389,7 +402,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
context.executeKernel(sortBoxDataKernel, context.getNumAtoms()); context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0); setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0);
setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1); setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : 128); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
} }
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include "openmm/Context.h" #include "openmm/Context.h"
#include "openmm/System.h" #include "openmm/System.h"
#include <algorithm> #include <algorithm>
#include <cctype>
#include <sstream> #include <sstream>
#ifdef __APPLE__ #ifdef __APPLE__
#include "sys/sysctl.h" #include "sys/sysctl.h"
...@@ -39,10 +40,7 @@ ...@@ -39,10 +40,7 @@
using namespace OpenMM; using namespace OpenMM;
using std::map; using namespace std;
using std::string;
using std::stringstream;
using std::vector;
#ifdef OPENMM_OPENCL_BUILDING_STATIC_LIBRARY #ifdef OPENMM_OPENCL_BUILDING_STATIC_LIBRARY
extern "C" void registerOpenCLPlatform() { extern "C" void registerOpenCLPlatform() {
......
...@@ -56,10 +56,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -56,10 +56,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2); unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer); unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2) unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = rangeKernelSize; positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2); sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
if (rangeKernelSize > length) if (rangeKernelSize > length)
rangeKernelSize = length; rangeKernelSize = length;
......
...@@ -200,7 +200,7 @@ __kernel void computeN2Energy( ...@@ -200,7 +200,7 @@ __kernel void computeN2Energy(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -213,7 +213,7 @@ __kernel void computeN2Energy( ...@@ -213,7 +213,7 @@ __kernel void computeN2Energy(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -216,7 +216,7 @@ __kernel void computeN2Energy( ...@@ -216,7 +216,7 @@ __kernel void computeN2Energy(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -229,7 +229,7 @@ __kernel void computeN2Energy( ...@@ -229,7 +229,7 @@ __kernel void computeN2Energy(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -174,7 +174,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -174,7 +174,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -187,7 +187,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -187,7 +187,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -184,7 +184,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -184,7 +184,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -197,7 +197,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4* ...@@ -197,7 +197,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -186,7 +186,7 @@ __kernel void computeBornSum( ...@@ -186,7 +186,7 @@ __kernel void computeBornSum(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -199,7 +199,7 @@ __kernel void computeBornSum( ...@@ -199,7 +199,7 @@ __kernel void computeBornSum(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
...@@ -574,7 +574,7 @@ __kernel void computeGBSAForce1( ...@@ -574,7 +574,7 @@ __kernel void computeGBSAForce1(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -587,7 +587,7 @@ __kernel void computeGBSAForce1( ...@@ -587,7 +587,7 @@ __kernel void computeGBSAForce1(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -192,7 +192,7 @@ __kernel void computeBornSum( ...@@ -192,7 +192,7 @@ __kernel void computeBornSum(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -205,7 +205,7 @@ __kernel void computeBornSum( ...@@ -205,7 +205,7 @@ __kernel void computeBornSum(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
...@@ -607,7 +607,7 @@ __kernel void computeGBSAForce1( ...@@ -607,7 +607,7 @@ __kernel void computeGBSAForce1(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -620,7 +620,7 @@ __kernel void computeGBSAForce1( ...@@ -620,7 +620,7 @@ __kernel void computeGBSAForce1(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -213,7 +213,7 @@ __kernel void computeNonbonded( ...@@ -213,7 +213,7 @@ __kernel void computeNonbonded(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -226,7 +226,7 @@ __kernel void computeNonbonded( ...@@ -226,7 +226,7 @@ __kernel void computeNonbonded(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
...@@ -230,7 +230,7 @@ __kernel void computeNonbonded( ...@@ -230,7 +230,7 @@ __kernel void computeNonbonded(
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
unsigned int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -243,7 +243,7 @@ __kernel void computeNonbonded( ...@@ -243,7 +243,7 @@ __kernel void computeNonbonded(
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1); y += (x < y ? -1 : 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment