Merge pull request #878 from peastman/applecl

Enabled OpenCL on OS X

Merge pull request #878 from peastman/applecl
Enabled OpenCL on OS X
3946c025 · peastman · 013cf20e · aacefccf · 3946c025 · 3946c025
Commit 3946c025 authored Apr 16, 2015 by peastman
7 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -405,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN)
 MARK_AS_ADVANCED(CUDA_BUILD_EMULATION)
 FIND_PACKAGE(OpenCL QUIET)
-IF(OPENCL_FOUND AND NOT APPLE)
+IF(OPENCL_FOUND)
    SET(OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL "Build OpenMMOpenCL library")
-ELSE(OPENCL_FOUND AND NOT APPLE)
+ELSE(OPENCL_FOUND)
    SET(OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL "Build OpenMMOpenCL library")
-ENDIF(OPENCL_FOUND AND NOT APPLE)
+ENDIF(OPENCL_FOUND)
 IF(OPENMM_BUILD_OPENCL_LIB)
    ADD_SUBDIRECTORY(platforms/opencl)
 ENDIF(OPENMM_BUILD_OPENCL_LIB)

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                // if they supplied a valid deviceIndex, we only look through that one
                if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
                    continue;
+                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
-                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD"))
+                    continue; // The CPU device on OS X won't work correctly.
-                    continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
                int processingElementsPerComputeUnit = 8;
                if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
        if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
            defaultOptimizationOptions = "";
+        else if (platformVendor == "Apple")
+            defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
        else
            defaultOptimizationOptions = "-cl-fast-relaxed-math";
        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        }
        else
            simdWidth = 1;
-        if (platformVendor == "Apple" && vendor == "AMD")
-            compilationDefines["MAC_AMD_WORKAROUND"] = "";
        if (supports64BitGlobalAtomics)
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -75,6 +75,7 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
+            defines["M_PI"] = context.doubleToString(M_PI);
            cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
            packForwardKernel = cl::Kernel(program, "packForwardData");
            unpackForwardKernel = cl::Kernel(program, "unpackForwardData");

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
            pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
            pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
            pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["M_PI"] = cl.doubleToString(M_PI);
            bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
            if (deviceIsCpu)
                pmeDefines["DEVICE_IS_CPU"] = "1";

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
 bool OpenCLPlatform::isPlatformSupported() {
    // Return false for OpenCL implementations that are known
-    // to be buggy (Apple OSX since 10.7.5)
+    // to be buggy (Apple OS X prior to 10.10).
 #ifdef __APPLE__
    char str[256];
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
    if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
        return false;
-    if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2))
+    if (major < 14 || (major == 14 && minor < 3))
-        // 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
+        // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
-        // point at which a number of serious bugs were introduced into the
+        // contained a number of serious bugs in the Apple OpenCL libraries.
-        // Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
+        // (See https://github.com/SimTk/openmm/issues/395 for example.)
-        // (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
-        // this version check should be updated.
        return false;
 #endif

--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
-    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
    float maxValue = (float) (range[1]);
    float bucketWidth = (maxValue-minValue)/numBuckets;
    for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
-#if defined(MAC_AMD_WORKAROUND) && VALUE_IS_INT2
-        __global int* d = (__global int*) data;
-        int2 element = (int2) (d[2*index], d[2*index+1]);
-        float key = (float) getValue(element);
-#else
        float key = (float) getValue(data[index]);
-#endif
        uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
        offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
        bucketOfElement[index] = bucketIndex;