Merge pull request #1091 from rmcgibbo/osx-opencl

Fix OpenCL platform on low-end devices

Merge pull request #1091 from rmcgibbo/osx-opencl
Fix OpenCL platform on low-end devices
8a762772 · peastman · 14bd3629 · 8ae2eba5 · 8a762772 · 8a762772
Commit 8a762772 authored Aug 12, 2015 by peastman
4 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -130,7 +130,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                            // This will be less than the wavefront width since it takes several
                            // cycles to execute the full wavefront.
                            // The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
-                            // this is the number of ALUs that can be executing per instruction per thread. 
+                            // this is the number of ALUs that can be executing per instruction per thread.
                            devices[i].getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>() *
                            devices[i].getInfo<CL_DEVICE_SIMD_WIDTH_AMD>() *
                            devices[i].getInfo<CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD>();
@@ -342,9 +342,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        compilationDefines["EXP"] = "exp";
        compilationDefines["LOG"] = "log";
    }
-    
+
    // Set defines for applying periodic boundary conditions.
-    
+
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
@@ -392,11 +392,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    }

    // Create the work thread used for parallelization when running on multiple devices.
-    
+
    thread = new WorkThread();
-    
+
    // Create utilities objects.
-    
+
    bonded = new OpenCLBondedUtilities(*this);
    nonbonded = new OpenCLNonbondedUtilities(*this);
    integration = new OpenCLIntegrationUtilities(*this, system);
@@ -512,7 +512,7 @@ string OpenCLContext::replaceStrings(const string& input, const std::map<std::st
            if (index != result.npos) {
                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
                    // We have found a complete symbol, not part of a longer symbol.
-                    
+
                    result.replace(index, size, iter->second);
                    index += iter->second.size();
                }
@@ -797,7 +797,7 @@ private:

 void OpenCLContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.
-    
+
    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.

@@ -879,7 +879,7 @@ void OpenCLContext::findMoleculeGroups() {
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }
-            
+
            // See if the constraints are identical.

            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
@@ -960,11 +960,11 @@ void OpenCLContext::invalidateMolecules() {
    }
    if (valid)
        return;
-    
+
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
-    
+
    vector<mm_int4> newCellOffsets(numAtoms);
    if (useDoublePrecision) {
        vector<mm_double4> oldPosq(paddedNumAtoms);

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -186,7 +186,7 @@ static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
 void OpenCLNonbondedUtilities::initialize(const System& system) {
    if (atomExclusions.size() == 0) {
        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
-        
+
        atomExclusions.resize(context.getNumAtoms());
        for (int i = 0; i < (int) atomExclusions.size(); i++)
            atomExclusions[i].push_back(i);
@@ -199,7 +199,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);

    // Build a list of tiles that contain exclusions.
-    
+
    set<pair<int, int> > tilesWithExclusions;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/OpenCLContext::TileSize;
@@ -410,7 +410,7 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
    numTiles = (int) (endFraction*totalTiles)-startTileIndex;
    if (useCutoff) {
        // We are using a cutoff, and the kernels have already been created.
-        
+
        for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
            iter->second.forceKernel.setArg<cl_uint>(5, startTileIndex);
            iter->second.forceKernel.setArg<cl_uint>(6, numTiles);
@@ -491,7 +491,7 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer());
            if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
                // The device can't handle this block size, so reduce it.
-                
+
                groupSize -= 32;
                if (groupSize < 32)
                    throw OpenMMException("Failed to create findInteractingBlocks kernel");

--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -25,7 +25,7 @@ __kernel void computeNonbonded(
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
 #endif
@@ -38,7 +38,7 @@ __kernel void computeNonbonded(
    __local AtomData localData[FORCE_WORK_GROUP_SIZE];

    // First loop: process tiles that contain exclusions.
-    
+
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -100,7 +100,7 @@ __kernel void computeNonbonded(
        }
        else {
            // This is an off-diagonal tile.
-            
+
            const unsigned int localAtomIndex = get_local_id(0);
            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
@@ -126,7 +126,7 @@ __kernel void computeNonbonded(
 #endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                if (r2 < CUTOFF_SQUARED) {
+                if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
@@ -213,7 +213,7 @@ __kernel void computeNonbonded(
        bool includeTile = true;

        // Extract the coordinates of this tile.
-        
+
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
@@ -245,7 +245,7 @@ __kernel void computeNonbonded(
                }
                else
                    skipTiles[get_local_id(0)] = end;
-                skipBase += TILE_SIZE;            
+                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
@@ -300,7 +300,7 @@ __kernel void computeNonbonded(
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -352,7 +352,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef PRUNE_BY_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -22,7 +22,7 @@ __kernel void computeNonbonded(
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
 #endif
@@ -31,7 +31,7 @@ __kernel void computeNonbonded(
    __local AtomData localData[TILE_SIZE];

    // First loop: process tiles that contain exclusions.
-    
+
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
@@ -70,7 +70,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -138,7 +138,7 @@ __kernel void computeNonbonded(
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                    if (r2 < CUTOFF_SQUARED) {
+                    if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
@@ -228,9 +228,9 @@ __kernel void computeNonbonded(
    while (pos < end) {
        const bool hasExclusions = false;
        bool includeTile = true;
-        
+
        // Extract the coordinates of this tile.
-        
+
        int x, y;
        bool singlePeriodicCopy = false;
 #ifdef USE_CUTOFF
@@ -304,7 +304,7 @@ __kernel void computeNonbonded(
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                        real r2 = dot(delta.xyz, delta.xyz);
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
                            real invR = RSQRT(r2);
                            real r = r2*invR;
                            unsigned int atom2 = j;
@@ -367,7 +367,7 @@ __kernel void computeNonbonded(
 #endif
                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
-                        if (r2 < CUTOFF_SQUARED) {
+                        if (r2 < MAX_CUTOFF*MAX_CUTOFF) {
 #endif
                            real invR = RSQRT(r2);
                            real r = r2*invR;