Commit 8a762772 authored by peastman's avatar peastman
Browse files

Merge pull request #1091 from rmcgibbo/osx-opencl

Fix OpenCL platform on low-end devices
parents 14bd3629 8ae2eba5
...@@ -130,7 +130,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -130,7 +130,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// This will be less than the wavefront width since it takes several // This will be less than the wavefront width since it takes several
// cycles to execute the full wavefront. // cycles to execute the full wavefront.
// The SIMD instruction width is the VLIW instruction width (or 1 for scalar), // The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
// this is the number of ALUs that can be executing per instruction per thread. // this is the number of ALUs that can be executing per instruction per thread.
devices[i].getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>() * devices[i].getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>() *
devices[i].getInfo<CL_DEVICE_SIMD_WIDTH_AMD>() * devices[i].getInfo<CL_DEVICE_SIMD_WIDTH_AMD>() *
devices[i].getInfo<CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD>(); devices[i].getInfo<CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD>();
...@@ -342,9 +342,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -342,9 +342,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["EXP"] = "exp"; compilationDefines["EXP"] = "exp";
compilationDefines["LOG"] = "log"; compilationDefines["LOG"] = "log";
} }
// Set defines for applying periodic boundary conditions. // Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3]; Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 || boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
...@@ -392,11 +392,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -392,11 +392,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
} }
// Create the work thread used for parallelization when running on multiple devices. // Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread(); thread = new WorkThread();
// Create utilities objects. // Create utilities objects.
bonded = new OpenCLBondedUtilities(*this); bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this); nonbonded = new OpenCLNonbondedUtilities(*this);
integration = new OpenCLIntegrationUtilities(*this, system); integration = new OpenCLIntegrationUtilities(*this, system);
...@@ -512,7 +512,7 @@ string OpenCLContext::replaceStrings(const string& input, const std::map<std::st ...@@ -512,7 +512,7 @@ string OpenCLContext::replaceStrings(const string& input, const std::map<std::st
if (index != result.npos) { if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) { if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol. // We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second); result.replace(index, size, iter->second);
index += iter->second.size(); index += iter->second.size();
} }
...@@ -797,7 +797,7 @@ private: ...@@ -797,7 +797,7 @@ private:
void OpenCLContext::findMoleculeGroups() { void OpenCLContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system. // The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) { if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites. // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...@@ -879,7 +879,7 @@ void OpenCLContext::findMoleculeGroups() { ...@@ -879,7 +879,7 @@ void OpenCLContext::findMoleculeGroups() {
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i])) if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false; identical = false;
} }
// See if the constraints are identical. // See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) { for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
...@@ -960,11 +960,11 @@ void OpenCLContext::invalidateMolecules() { ...@@ -960,11 +960,11 @@ void OpenCLContext::invalidateMolecules() {
} }
if (valid) if (valid)
return; return;
// The list of which molecules are identical is no longer valid. We need to restore the // The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them // atoms to their original order, rebuild the list of identical molecules, and sort them
// again. // again.
vector<mm_int4> newCellOffsets(numAtoms); vector<mm_int4> newCellOffsets(numAtoms);
if (useDoublePrecision) { if (useDoublePrecision) {
vector<mm_double4> oldPosq(paddedNumAtoms); vector<mm_double4> oldPosq(paddedNumAtoms);
......
...@@ -186,7 +186,7 @@ static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) { ...@@ -186,7 +186,7 @@ static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
void OpenCLNonbondedUtilities::initialize(const System& system) { void OpenCLNonbondedUtilities::initialize(const System& system) {
if (atomExclusions.size() == 0) { if (atomExclusions.size() == 0) {
// No exclusions were specifically requested, so just mark every atom as not interacting with itself. // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions.resize(context.getNumAtoms()); atomExclusions.resize(context.getNumAtoms());
for (int i = 0; i < (int) atomExclusions.size(); i++) for (int i = 0; i < (int) atomExclusions.size(); i++)
atomExclusions[i].push_back(i); atomExclusions[i].push_back(i);
...@@ -199,7 +199,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -199,7 +199,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts); setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
// Build a list of tiles that contain exclusions. // Build a list of tiles that contain exclusions.
set<pair<int, int> > tilesWithExclusions; set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/OpenCLContext::TileSize; int x = atom1/OpenCLContext::TileSize;
...@@ -410,7 +410,7 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en ...@@ -410,7 +410,7 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
numTiles = (int) (endFraction*totalTiles)-startTileIndex; numTiles = (int) (endFraction*totalTiles)-startTileIndex;
if (useCutoff) { if (useCutoff) {
// We are using a cutoff, and the kernels have already been created. // We are using a cutoff, and the kernels have already been created.
for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) { for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
iter->second.forceKernel.setArg<cl_uint>(5, startTileIndex); iter->second.forceKernel.setArg<cl_uint>(5, startTileIndex);
iter->second.forceKernel.setArg<cl_uint>(6, numTiles); iter->second.forceKernel.setArg<cl_uint>(6, numTiles);
...@@ -491,7 +491,7 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -491,7 +491,7 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer()); kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer());
if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) { if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
// The device can't handle this block size, so reduce it. // The device can't handle this block size, so reduce it.
groupSize -= 32; groupSize -= 32;
if (groupSize < 32) if (groupSize < 32)
throw OpenMMException("Failed to create findInteractingBlocks kernel"); throw OpenMMException("Failed to create findInteractingBlocks kernel");
......
...@@ -25,7 +25,7 @@ __kernel void computeNonbonded( ...@@ -25,7 +25,7 @@ __kernel void computeNonbonded(
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
#endif #endif
...@@ -38,7 +38,7 @@ __kernel void computeNonbonded( ...@@ -38,7 +38,7 @@ __kernel void computeNonbonded(
__local AtomData localData[FORCE_WORK_GROUP_SIZE]; __local AtomData localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
...@@ -100,7 +100,7 @@ __kernel void computeNonbonded( ...@@ -100,7 +100,7 @@ __kernel void computeNonbonded(
} }
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
...@@ -126,7 +126,7 @@ __kernel void computeNonbonded( ...@@ -126,7 +126,7 @@ __kernel void computeNonbonded(
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef PRUNE_BY_CUTOFF #ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
...@@ -213,7 +213,7 @@ __kernel void computeNonbonded( ...@@ -213,7 +213,7 @@ __kernel void computeNonbonded(
bool includeTile = true; bool includeTile = true;
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -245,7 +245,7 @@ __kernel void computeNonbonded( ...@@ -245,7 +245,7 @@ __kernel void computeNonbonded(
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[get_local_id(0)] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
} }
...@@ -300,7 +300,7 @@ __kernel void computeNonbonded( ...@@ -300,7 +300,7 @@ __kernel void computeNonbonded(
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef PRUNE_BY_CUTOFF #ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
...@@ -352,7 +352,7 @@ __kernel void computeNonbonded( ...@@ -352,7 +352,7 @@ __kernel void computeNonbonded(
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef PRUNE_BY_CUTOFF #ifdef PRUNE_BY_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
......
...@@ -22,7 +22,7 @@ __kernel void computeNonbonded( ...@@ -22,7 +22,7 @@ __kernel void computeNonbonded(
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
#endif #endif
...@@ -31,7 +31,7 @@ __kernel void computeNonbonded( ...@@ -31,7 +31,7 @@ __kernel void computeNonbonded(
__local AtomData localData[TILE_SIZE]; __local AtomData localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
...@@ -70,7 +70,7 @@ __kernel void computeNonbonded( ...@@ -70,7 +70,7 @@ __kernel void computeNonbonded(
#endif #endif
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
...@@ -138,7 +138,7 @@ __kernel void computeNonbonded( ...@@ -138,7 +138,7 @@ __kernel void computeNonbonded(
#endif #endif
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
...@@ -228,9 +228,9 @@ __kernel void computeNonbonded( ...@@ -228,9 +228,9 @@ __kernel void computeNonbonded(
while (pos < end) { while (pos < end) {
const bool hasExclusions = false; const bool hasExclusions = false;
bool includeTile = true; bool includeTile = true;
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -304,7 +304,7 @@ __kernel void computeNonbonded( ...@@ -304,7 +304,7 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
unsigned int atom2 = j; unsigned int atom2 = j;
...@@ -367,7 +367,7 @@ __kernel void computeNonbonded( ...@@ -367,7 +367,7 @@ __kernel void computeNonbonded(
#endif #endif
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment