Commit b5009324 authored by Peter Eastman's avatar Peter Eastman
Browse files

Workaround for Nvidia compiler bug where compute level 1.2 processors try to...

Workaround for Nvidia compiler bug where compute level 1.2 processors try to compile double precision code
parent b79d9e84
...@@ -105,6 +105,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -105,6 +105,7 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions = "-cl-fast-relaxed-math"; defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
string vendor = device.getInfo<CL_DEVICE_VENDOR>(); string vendor = device.getInfo<CL_DEVICE_VENDOR>();
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") { if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationDefines["WARPS_ARE_ATOMIC"] = ""; compilationDefines["WARPS_ARE_ATOMIC"] = "";
...@@ -131,6 +132,8 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde ...@@ -131,6 +132,8 @@ OpenCLContext::OpenCLContext(int numParticles, int platformIndex, int deviceInde
simdWidth = 1; simdWidth = 1;
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = ""; compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision)
compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
queue = cl::CommandQueue(context, device); queue = cl::CommandQueue(context, device);
numAtoms = numParticles; numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
......
...@@ -401,6 +401,12 @@ public: ...@@ -401,6 +401,12 @@ public:
bool getSupports64BitGlobalAtomics() { bool getSupports64BitGlobalAtomics() {
return supports64BitGlobalAtomics; return supports64BitGlobalAtomics;
} }
/**
* Get whether the device being used supports double precision math.
*/
bool getSupportsDoublePrecision() {
return supportsDoublePrecision;
}
/** /**
* Get the size of the periodic box. * Get the size of the periodic box.
*/ */
...@@ -478,7 +484,7 @@ private: ...@@ -478,7 +484,7 @@ private:
int numThreadBlocks; int numThreadBlocks;
int numForceBuffers; int numForceBuffers;
int simdWidth; int simdWidth;
bool supports64BitGlobalAtomics, atomsWereReordered; bool supports64BitGlobalAtomics, supportsDoublePrecision, atomsWereReordered;
mm_float4 periodicBoxSize; mm_float4 periodicBoxSize;
mm_float4 invPeriodicBoxSize; mm_float4 invPeriodicBoxSize;
std::string defaultOptimizationOptions; std::string defaultOptimizationOptions;
......
#ifdef cl_khr_fp64 #ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif #endif
...@@ -32,7 +32,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co ...@@ -32,7 +32,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
*/ */
__kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) { __kernel void integrateLangevinPart2(__global float4* restrict posq, __global const float4* restrict posDelta, __global float4* restrict velm, __global const float2* restrict dt) {
#ifdef cl_khr_fp64 #ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
float invStepSize = 1.0f/dt[0].y; float invStepSize = 1.0f/dt[0].y;
...@@ -43,7 +43,7 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co ...@@ -43,7 +43,7 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
float4 delta = posDelta[index]; float4 delta = posDelta[index];
float4 vel = velm[index]; float4 vel = velm[index];
pos.xyz += delta.xyz; pos.xyz += delta.xyz;
#ifdef cl_khr_fp64 #ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz; vel.xyz = convert_float4(invStepSize*convert_double4(delta)).xyz;
#else #else
vel.xyz = invStepSize*delta.xyz; vel.xyz = invStepSize*delta.xyz;
......
#ifdef cl_khr_fp64 #ifdef SUPPORTS_DOUBLE_PRECISION
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif #endif
...@@ -28,7 +28,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global const float2* restrict ...@@ -28,7 +28,7 @@ __kernel void integrateVerletPart1(int numAtoms, __global const float2* restrict
__kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, __global float4* restrict posq, __global float4* restrict velm, __global const float4* restrict posDelta) { __kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, __global float4* restrict posq, __global float4* restrict velm, __global const float4* restrict posDelta) {
float2 stepSize = dt[0]; float2 stepSize = dt[0];
#ifdef cl_khr_fp64 #ifdef SUPPORTS_DOUBLE_PRECISION
double oneOverDt = 1.0/stepSize.y; double oneOverDt = 1.0/stepSize.y;
#else #else
float oneOverDt = 1.0f/stepSize.y; float oneOverDt = 1.0f/stepSize.y;
...@@ -42,7 +42,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, _ ...@@ -42,7 +42,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global float2* restrict dt, _
float4 delta = posDelta[index]; float4 delta = posDelta[index];
float4 velocity = velm[index]; float4 velocity = velm[index];
pos.xyz += delta.xyz; pos.xyz += delta.xyz;
#ifdef cl_khr_fp64 #ifdef SUPPORTS_DOUBLE_PRECISION
velocity.xyz = convert_float4(convert_double4(delta)*oneOverDt).xyz; velocity.xyz = convert_float4(convert_double4(delta)*oneOverDt).xyz;
#else #else
velocity.xyz = delta.xyz*oneOverDt; velocity.xyz = delta.xyz*oneOverDt;
......
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
float4 multiplyComplexRealPart(float2 c1, float4 c2r, float4 c2i) { float4 multiplyComplexRealPart(float2 c1, float4 c2r, float4 c2i) {
return c1.x*c2r-c1.y*c2i; return c1.x*c2r-c1.y*c2i;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment