Commit 3946c025 authored by peastman's avatar peastman
Browse files

Merge pull request #878 from peastman/applecl

Enabled OpenCL on OS X
parents 013cf20e aacefccf
...@@ -405,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN) ...@@ -405,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN)
MARK_AS_ADVANCED(CUDA_BUILD_EMULATION) MARK_AS_ADVANCED(CUDA_BUILD_EMULATION)
FIND_PACKAGE(OpenCL QUIET) FIND_PACKAGE(OpenCL QUIET)
IF(OPENCL_FOUND AND NOT APPLE) IF(OPENCL_FOUND)
SET(OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL "Build OpenMMOpenCL library") SET(OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL "Build OpenMMOpenCL library")
ELSE(OPENCL_FOUND AND NOT APPLE) ELSE(OPENCL_FOUND)
SET(OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL "Build OpenMMOpenCL library") SET(OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL "Build OpenMMOpenCL library")
ENDIF(OPENCL_FOUND AND NOT APPLE) ENDIF(OPENCL_FOUND)
IF(OPENMM_BUILD_OPENCL_LIB) IF(OPENMM_BUILD_OPENCL_LIB)
ADD_SUBDIRECTORY(platforms/opencl) ADD_SUBDIRECTORY(platforms/opencl)
ENDIF(OPENMM_BUILD_OPENCL_LIB) ENDIF(OPENMM_BUILD_OPENCL_LIB)
......
...@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one // if they supplied a valid deviceIndex, we only look through that one
if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size()) if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
continue; continue;
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")) continue; // The CPU device on OS X won't work correctly.
continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0]; int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
int processingElementsPerComputeUnit = 8; int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) { if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
...@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel") if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = ""; defaultOptimizationOptions = "";
else if (platformVendor == "Apple")
defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
else else
defaultOptimizationOptions = "-cl-fast-relaxed-math"; defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
...@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
} }
else else
simdWidth = 1; simdWidth = 1;
if (platformVendor == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = ""; compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision) if (supportsDoublePrecision)
......
...@@ -75,6 +75,7 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize ...@@ -75,6 +75,7 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
defines["PACKED_XSIZE"] = context.intToString(packedXSize); defines["PACKED_XSIZE"] = context.intToString(packedXSize);
defines["PACKED_YSIZE"] = context.intToString(packedYSize); defines["PACKED_YSIZE"] = context.intToString(packedYSize);
defines["PACKED_ZSIZE"] = context.intToString(packedZSize); defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
defines["M_PI"] = context.doubleToString(M_PI);
cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines); cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
packForwardKernel = cl::Kernel(program, "packForwardData"); packForwardKernel = cl::Kernel(program, "packForwardData");
unpackForwardKernel = cl::Kernel(program, "unpackForwardData"); unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
......
...@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY); pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ); pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0)); pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cl.doubleToString(M_PI);
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) if (deviceIsCpu)
pmeDefines["DEVICE_IS_CPU"] = "1"; pmeDefines["DEVICE_IS_CPU"] = "1";
......
...@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const { ...@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool OpenCLPlatform::isPlatformSupported() { bool OpenCLPlatform::isPlatformSupported() {
// Return false for OpenCL implementations that are known // Return false for OpenCL implementations that are known
// to be buggy (Apple OSX since 10.7.5) // to be buggy (Apple OS X prior to 10.10).
#ifdef __APPLE__ #ifdef __APPLE__
char str[256]; char str[256];
...@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() { ...@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3) if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
return false; return false;
if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2)) if (major < 14 || (major == 14 && minor < 3))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// point at which a number of serious bugs were introduced into the // contained a number of serious bugs in the Apple OpenCL libraries.
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations // (See https://github.com/SimTk/openmm/issues/395 for example.)
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
return false; return false;
#endif #endif
......
...@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le ...@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements["MIN_KEY"] = trait->getMinKey(); replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements)); cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
shortListKernel = cl::Kernel(program, "sortShortList"); shortListKernel = cl::Kernel(program, "sortShortList");
computeRangeKernel = cl::Kernel(program, "computeRange"); computeRangeKernel = cl::Kernel(program, "computeRange");
......
...@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u ...@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
float maxValue = (float) (range[1]); float maxValue = (float) (range[1]);
float bucketWidth = (maxValue-minValue)/numBuckets; float bucketWidth = (maxValue-minValue)/numBuckets;
for (uint index = get_global_id(0); index < length; index += get_global_size(0)) { for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
#if defined(MAC_AMD_WORKAROUND) && VALUE_IS_INT2
__global int* d = (__global int*) data;
int2 element = (int2) (d[2*index], d[2*index+1]);
float key = (float) getValue(element);
#else
float key = (float) getValue(data[index]); float key = (float) getValue(data[index]);
#endif
uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1); uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]); offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
bucketOfElement[index] = bucketIndex; bucketOfElement[index] = bucketIndex;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment