Workaround for Nvidia compiler bug where compute level 1.2 processors try to compile double precision code

Workaround for Nvidia compiler bug where compute level 1.2 processors try to compile double precision code

Workaround for Nvidia compiler bug where compute level 1.2 processors try to compile double precision code
Workaround for Nvidia compiler bug where compute level 1.2 processors try to compile double precision code
b5009324 · Peter Eastman · b79d9e84 · b5009324 · b5009324 · b5009324
Commit b5009324 authored Dec 01, 2011 by Peter Eastman
5 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -105,6 +105,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
        compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize);
        defaultOptimizationOptions = "-cl-fast-relaxed-math";
        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
+        supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
        string vendor = device.getInfo<CL_DEVICE_VENDOR>();
        if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
            compilationDefines["WARPS_ARE_ATOMIC"] = "";
@@ -131,6 +132,8 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
            simdWidth = 1;
        if (supports64BitGlobalAtomics)
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
+        if (supportsDoublePrecision)
+            compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
        queue = cl::CommandQueue(context, device);
        numAtoms = numParticles;
        paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -401,6 +401,12 @@ public:
    bool getSupports64BitGlobalAtomics() {
        return supports64BitGlobalAtomics;
    }
+    /**
+     * Get whether the device being used supports double precision math (true when its extension list includes cl_khr_fp64).
+     */
+    bool getSupportsDoublePrecision() {
+        return supportsDoublePrecision;
+    }
    /**
     * Get the size of the periodic box.
     */
@@ -478,7 +484,7 @@ private:
    int numThreadBlocks;
    int numForceBuffers;
    int simdWidth;
-    bool supports64BitGlobalAtomics, atomsWereReordered;
+    bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered;
    mm_float4 periodicBoxSize;
    mm_float4 invPeriodicBoxSize;
    std::string defaultOptimizationOptions;

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
-#ifdef cl_khr_fp64
+#ifdef SUPPORTS_DOUBLE_PRECISION
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
@@ -32,7 +32,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
 */
 __kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
-#ifdef cl_khr_fp64
+#ifdef SUPPORTS_DOUBLE_PRECISION
    double invStepSize = 1.0/dt[0].y;
 #else
    float invStepSize = 1.0f/dt[0].y;
@@ -43,7 +43,7 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
        float4 delta = posDelta[index];
        float4 vel = velm[index];
        pos.xyz += delta.xyz;
-#ifdef cl_khr_fp64
+#ifdef SUPPORTS_DOUBLE_PRECISION
        vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz;
 #else
        vel.xyz = invStepSize*delta.xyz;

--- a/platforms/opencl/src/kernels/verlet.cl
+++ b/platforms/opencl/src/kernels/verlet.cl
-#ifdef cl_khr_fp64
+#ifdef SUPPORTS_DOUBLE_PRECISION
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #endif
@@ -28,7 +28,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global const float2* restrict
 __kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, __global float4* restrict posq, __global float4* restrict velm, __global const float4* restrict posDelta) {
    float2 stepSize = dt[0];
-#ifdef cl_khr_fp64
+#ifdef SUPPORTS_DOUBLE_PRECISION
    double oneOverDt = 1.0/stepSize.y;
 #else
    float oneOverDt = 1.0f/stepSize.y;
@@ -42,7 +42,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, _
        float4 delta = posDelta[index];
        float4 velocity = velm[index];
        pos.xyz += delta.xyz;
-#ifdef cl_khr_fp64
+#ifdef SUPPORTS_DOUBLE_PRECISION
        velocity.xyz = convert_float4(convert_double4(delta)*oneOverDt).xyz;
 #else
        velocity.xyz = delta.xyz*oneOverDt;

--- a/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
+++ b/plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#endif
 float4 multiplyComplexRealPart(float2 c1, float4 c2r, float4 c2i) {
    return c1.x*c2r-c1.y*c2i;
 }