Commit 6ab42996 authored by Peter Eastman's avatar Peter Eastman
Browse files

Change suggested by Tony Tye to simplify debugging by including all...

Change suggested by Tony Tye to simplify debugging by including all compilation options in the kernel source
parent c1f59f8e
...@@ -91,12 +91,12 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform:: ...@@ -91,12 +91,12 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
this->deviceIndex = deviceIndex; this->deviceIndex = deviceIndex;
if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize) if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM"); throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationOptions = "-DWORK_GROUP_SIZE="+OpenCLExpressionUtilities::intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions = "-cl-fast-relaxed-math"; defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
string vendor = device.getInfo<CL_DEVICE_VENDOR>(); string vendor = device.getInfo<CL_DEVICE_VENDOR>();
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") { if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationOptions += " -DWARPS_ARE_ATOMIC"; compilationDefines["WARPS_ARE_ATOMIC"] = "";
simdWidth = 32; simdWidth = 32;
if (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) { if (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) {
// Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the // Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
...@@ -111,7 +111,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform:: ...@@ -111,7 +111,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
} }
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") { else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. // AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
compilationOptions += " -DAMD_ATOMIC_WORK_AROUND"; compilationDefines["AMD_ATOMIC_WORK_AROUND"] = "";
// AMD has both 32 and 64 width SIMDs. To determine need to create a kernel to query. // AMD has both 32 and 64 width SIMDs. To determine need to create a kernel to query.
// For now default to 1 which will use the default kernels. // For now default to 1 which will use the default kernels.
simdWidth = 1; simdWidth = 1;
...@@ -119,7 +119,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform:: ...@@ -119,7 +119,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
else else
simdWidth = 1; simdWidth = 1;
if (supports64BitGlobalAtomics) if (supports64BitGlobalAtomics)
compilationOptions += " -DSUPPORTS_64_BIT_ATOMICS"; compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
queue = cl::CommandQueue(context, device); queue = cl::CommandQueue(context, device);
numAtoms = numParticles; numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
...@@ -169,26 +169,11 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform:: ...@@ -169,26 +169,11 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4); maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5); maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
} }
if (maxSqrtError < 1e-6) compilationDefines["SQRT"] = (maxSqrtError < 1e-6) ? "native_sqrt" : "sqrt";
compilationOptions += " -DSQRT=native_sqrt"; compilationDefines["RSQRT"] = (maxRsqrtError < 1e-6) ? "native_rsqrt" : "rsqrt";
else compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
compilationOptions += " -DSQRT=sqrt"; compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
if (maxRsqrtError < 1e-6) compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
compilationOptions += " -DRSQRT=native_rsqrt";
else
compilationOptions += " -DRSQRT=rsqrt";
if (maxRecipError < 1e-6)
compilationOptions += " -DRECIP=native_recip";
else
compilationOptions += " -DRECIP=1.0f/";
if (maxExpError < 1e-6)
compilationOptions += " -DEXP=native_exp";
else
compilationOptions += " -DEXP=exp";
if (maxLogError < 1e-6)
compilationOptions += " -DLOG=native_log";
else
compilationOptions += " -DLOG=log";
// Create the work thread used for parallelization when running on multiple devices. // Create the work thread used for parallelization when running on multiple devices.
...@@ -280,18 +265,34 @@ cl::Program OpenCLContext::createProgram(const string source, const char* optimi ...@@ -280,18 +265,34 @@ cl::Program OpenCLContext::createProgram(const string source, const char* optimi
} }
cl::Program OpenCLContext::createProgram(const string source, const map<string, string>& defines, const char* optimizationFlags) { cl::Program OpenCLContext::createProgram(const string source, const map<string, string>& defines, const char* optimizationFlags) {
cl::Program::Sources sources(1, make_pair(source.c_str(), source.size())); string options = (optimizationFlags == NULL ? defaultOptimizationOptions : optimizationFlags);
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
for (map<string, string>::const_iterator iter = compilationDefines.begin(); iter != compilationDefines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!compilationDefines.empty())
src << endl;
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!defines.empty())
src << endl;
src << source << endl;
// Get length before using c_str() to avoid length() call invalidating the c_str() value.
string src_string = src.str();
::size_t src_length = src_string.length();
cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
cl::Program program(context, sources); cl::Program program(context, sources);
stringstream options;
options << compilationOptions;
if (optimizationFlags == NULL)
options << " " << defaultOptimizationOptions;
else
options << " " << optimizationFlags;
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter)
options << " -D" << iter->first << "=" << iter->second;
try { try {
program.build(vector<cl::Device>(1, device), options.str().c_str()); program.build(vector<cl::Device>(1, device), options.c_str());
} catch (cl::Error err) { } catch (cl::Error err) {
throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)); throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
} }
......
#ifndef OPENMM_OPENCLCONTEXT_H_ #ifndef OPENMM_OPENCLCONTEXT_H_
#define OPENMM_OPENCLCONTEXT_H_ #define OPENMM_OPENCLCONTEXT_H_
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* OpenMM * * OpenMM *
* -------------------------------------------------------------------------- * * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from * * This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of * * Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
* This program is free software: you can redistribute it and/or modify * * This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published * * it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or * * by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. * * (at your option) any later version. *
* * * *
* This program is distributed in the hope that it will be useful, * * This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of * * but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. * * GNU Lesser General Public License for more details. *
* * * *
* You should have received a copy of the GNU Lesser General Public License * * You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. * * along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include <map> #include <map>
#include <queue> #include <queue>
#include <string> #include <string>
#include <pthread.h> #include <pthread.h>
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER #ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code. // Prevent Windows from defining macros that interfere with other code.
#define NOMINMAX #define NOMINMAX
#endif #endif
#include <cl.hpp> #include <cl.hpp>
#include "openmm/internal/windowsExport.h" #include "openmm/internal/windowsExport.h"
#include "OpenCLPlatform.h" #include "OpenCLPlatform.h"
namespace OpenMM { namespace OpenMM {
template <class T> template <class T>
class OpenCLArray; class OpenCLArray;
class OpenCLForceInfo; class OpenCLForceInfo;
class OpenCLIntegrationUtilities; class OpenCLIntegrationUtilities;
class OpenCLNonbondedUtilities; class OpenCLNonbondedUtilities;
class System; class System;
/** /**
* We can't use predefined vector types like cl_float4, since different OpenCL implementations currently define * We can't use predefined vector types like cl_float4, since different OpenCL implementations currently define
* them in incompatible ways. Hopefully that will be fixed in the future. In the mean time, we define our own * them in incompatible ways. Hopefully that will be fixed in the future. In the mean time, we define our own
* types to represent them on the host. * types to represent them on the host.
*/ */
struct mm_float2 { struct mm_float2 {
cl_float x, y; cl_float x, y;
mm_float2() { mm_float2() {
} }
mm_float2(cl_float x, cl_float y) : x(x), y(y) { mm_float2(cl_float x, cl_float y) : x(x), y(y) {
} }
}; };
struct mm_float4 { struct mm_float4 {
cl_float x, y, z, w; cl_float x, y, z, w;
mm_float4() { mm_float4() {
} }
mm_float4(cl_float x, cl_float y, cl_float z, cl_float w) : x(x), y(y), z(z), w(w) { mm_float4(cl_float x, cl_float y, cl_float z, cl_float w) : x(x), y(y), z(z), w(w) {
} }
}; };
struct mm_float8 { struct mm_float8 {
cl_float s0, s1, s2, s3, s4, s5, s6, s7; cl_float s0, s1, s2, s3, s4, s5, s6, s7;
mm_float8() { mm_float8() {
} }
mm_float8(cl_float s0, cl_float s1, cl_float s2, cl_float s3, cl_float s4, cl_float s5, cl_float s6, cl_float s7) : mm_float8(cl_float s0, cl_float s1, cl_float s2, cl_float s3, cl_float s4, cl_float s5, cl_float s6, cl_float s7) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7) { s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7) {
} }
}; };
struct mm_float16 { struct mm_float16 {
cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
mm_float16() { mm_float16() {
} }
mm_float16(cl_float s0, cl_float s1, cl_float s2, cl_float s3, cl_float s4, cl_float s5, cl_float s6, cl_float s7, mm_float16(cl_float s0, cl_float s1, cl_float s2, cl_float s3, cl_float s4, cl_float s5, cl_float s6, cl_float s7,
cl_float s8, cl_float s9, cl_float s10, cl_float s11, cl_float s12, cl_float s13, cl_float s14, cl_float s15) : cl_float s8, cl_float s9, cl_float s10, cl_float s11, cl_float s12, cl_float s13, cl_float s14, cl_float s15) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7), s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7),
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) { s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
} }
}; };
struct mm_ushort2 { struct mm_ushort2 {
cl_ushort x, y; cl_ushort x, y;
mm_ushort2() { mm_ushort2() {
} }
mm_ushort2(cl_ushort x, cl_ushort y) : x(x), y(y) { mm_ushort2(cl_ushort x, cl_ushort y) : x(x), y(y) {
} }
}; };
struct mm_int2 { struct mm_int2 {
cl_int x, y; cl_int x, y;
mm_int2() { mm_int2() {
} }
mm_int2(cl_int x, cl_int y) : x(x), y(y) { mm_int2(cl_int x, cl_int y) : x(x), y(y) {
} }
}; };
struct mm_int4 { struct mm_int4 {
cl_int x, y, z, w; cl_int x, y, z, w;
mm_int4() { mm_int4() {
} }
mm_int4(cl_int x, cl_int y, cl_int z, cl_int w) : x(x), y(y), z(z), w(w) { mm_int4(cl_int x, cl_int y, cl_int z, cl_int w) : x(x), y(y), z(z), w(w) {
} }
}; };
struct mm_int8 { struct mm_int8 {
cl_int s0, s1, s2, s3, s4, s5, s6, s7; cl_int s0, s1, s2, s3, s4, s5, s6, s7;
mm_int8() { mm_int8() {
} }
mm_int8(cl_int s0, cl_int s1, cl_int s2, cl_int s3, cl_int s4, cl_int s5, cl_int s6, cl_int s7) : mm_int8(cl_int s0, cl_int s1, cl_int s2, cl_int s3, cl_int s4, cl_int s5, cl_int s6, cl_int s7) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7) { s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7) {
} }
}; };
struct mm_int16 { struct mm_int16 {
cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
mm_int16() { mm_int16() {
} }
mm_int16(cl_int s0, cl_int s1, cl_int s2, cl_int s3, cl_int s4, cl_int s5, cl_int s6, cl_int s7, mm_int16(cl_int s0, cl_int s1, cl_int s2, cl_int s3, cl_int s4, cl_int s5, cl_int s6, cl_int s7,
cl_int s8, cl_int s9, cl_int s10, cl_int s11, cl_int s12, cl_int s13, cl_int s14, cl_int s15) : cl_int s8, cl_int s9, cl_int s10, cl_int s11, cl_int s12, cl_int s13, cl_int s14, cl_int s15) :
s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7), s0(s0), s1(s1), s2(s2), s3(s3), s4(s4), s5(s5), s6(s6), s7(s7),
s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) { s8(s8), s9(s9), s10(s10), s11(s11), s12(s12), s13(s13), s14(s14), s15(15) {
} }
}; };
/** /**
* This class contains the information associated with a Context by the OpenCL Platform. Each OpenCLContext is * This class contains the information associated with a Context by the OpenCL Platform. Each OpenCLContext is
* specific to a particular device, and manages data structures and kernels for that device. When running a simulation * specific to a particular device, and manages data structures and kernels for that device. When running a simulation
* in parallel on multiple devices, there is a separate OpenCLContext for each one. The list of all contexts is * in parallel on multiple devices, there is a separate OpenCLContext for each one. The list of all contexts is
* stored in the OpenCLPlatform::PlatformData. * stored in the OpenCLPlatform::PlatformData.
* <p> * <p>
* In addition, a worker thread is created for each OpenCLContext. This is used for parallel computations, so that * In addition, a worker thread is created for each OpenCLContext. This is used for parallel computations, so that
* blocking calls to one device will not block other devices. When only a single device is being used, the worker * blocking calls to one device will not block other devices. When only a single device is being used, the worker
* thread is not used and calculations are performed on the main application thread. * thread is not used and calculations are performed on the main application thread.
*/ */
class OPENMM_EXPORT OpenCLContext { class OPENMM_EXPORT OpenCLContext {
public: public:
class WorkTask; class WorkTask;
class WorkThread; class WorkThread;
static const int ThreadBlockSize; static const int ThreadBlockSize;
static const int TileSize; static const int TileSize;
OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::PlatformData& platformData); OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::PlatformData& platformData);
~OpenCLContext(); ~OpenCLContext();
/** /**
* This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
* have been initialized. * have been initialized.
*/ */
void initialize(const System& system); void initialize(const System& system);
/** /**
* Add an OpenCLForce to this context. * Add an OpenCLForce to this context.
*/ */
void addForce(OpenCLForceInfo* force); void addForce(OpenCLForceInfo* force);
/** /**
* Get the cl::Context associated with this object. * Get the cl::Context associated with this object.
*/ */
cl::Context& getContext() { cl::Context& getContext() {
return context; return context;
} }
/** /**
* Get the cl::Device associated with this object. * Get the cl::Device associated with this object.
*/ */
cl::Device& getDevice() { cl::Device& getDevice() {
return device; return device;
} }
/** /**
* Get the index of the cl::Device associated with this object. * Get the index of the cl::Device associated with this object.
*/ */
int getDeviceIndex() { int getDeviceIndex() {
return deviceIndex; return deviceIndex;
} }
/** /**
* Get the PlatformData object this context is part of. * Get the PlatformData object this context is part of.
*/ */
OpenCLPlatform::PlatformData& getPlatformData() { OpenCLPlatform::PlatformData& getPlatformData() {
return platformData; return platformData;
} }
/** /**
* Get the index of this context in the list stored in the PlatformData. * Get the index of this context in the list stored in the PlatformData.
*/ */
int getContextIndex() const { int getContextIndex() const {
return contextIndex; return contextIndex;
} }
/** /**
* Get the cl::CommandQueue associated with this object. * Get the cl::CommandQueue associated with this object.
*/ */
cl::CommandQueue& getQueue() { cl::CommandQueue& getQueue() {
return queue; return queue;
} }
/** /**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom. * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/ */
OpenCLArray<mm_float4>& getPosq() { OpenCLArray<mm_float4>& getPosq() {
return *posq; return *posq;
} }
/** /**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom. * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/ */
OpenCLArray<mm_float4>& getVelm() { OpenCLArray<mm_float4>& getVelm() {
return *velm; return *velm;
} }
/** /**
* Get the array which contains the force on each atom. * Get the array which contains the force on each atom.
*/ */
OpenCLArray<mm_float4>& getForce() { OpenCLArray<mm_float4>& getForce() {
return *force; return *force;
} }
/** /**
* Get the array which contains the buffers in which forces are computed. * Get the array which contains the buffers in which forces are computed.
*/ */
OpenCLArray<mm_float4>& getForceBuffers() { OpenCLArray<mm_float4>& getForceBuffers() {
return *forceBuffers; return *forceBuffers;
} }
/** /**
* Get the array which contains the buffer in which energy is computed. * Get the array which contains the buffer in which energy is computed.
*/ */
OpenCLArray<cl_float>& getEnergyBuffer() { OpenCLArray<cl_float>& getEnergyBuffer() {
return *energyBuffer; return *energyBuffer;
} }
/** /**
* Get the array which contains the index of each atom. * Get the array which contains the index of each atom.
*/ */
OpenCLArray<cl_int>& getAtomIndex() { OpenCLArray<cl_int>& getAtomIndex() {
return *atomIndex; return *atomIndex;
} }
/** /**
* Get the number of cells by which the positions are offset. * Get the number of cells by which the positions are offset.
*/ */
std::vector<mm_int4>& getPosCellOffsets() { std::vector<mm_int4>& getPosCellOffsets() {
return posCellOffsets; return posCellOffsets;
} }
/** /**
* Load OpenCL source code from a file in the kernels directory. * Load OpenCL source code from a file in the kernels directory.
*/ */
std::string loadSourceFromFile(const std::string& filename) const; std::string loadSourceFromFile(const std::string& filename) const;
/** /**
* Load OpenCL source code from a file in the kernels directory. * Load OpenCL source code from a file in the kernels directory.
* *
* @param filename the file to load * @param filename the file to load
* @param replacements a set of strings that should be replaced with new strings wherever they appear in the file * @param replacements a set of strings that should be replaced with new strings wherever they appear in the file
*/ */
std::string loadSourceFromFile(const std::string& filename, const std::map<std::string, std::string>& replacements) const; std::string loadSourceFromFile(const std::string& filename, const std::map<std::string, std::string>& replacements) const;
/** /**
* Replace all occurrences of a list of substrings. * Replace all occurrences of a list of substrings.
* *
* @param input a string to process * @param input a string to process
* @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string * @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
* @return a new string produced by performing the replacements * @return a new string produced by performing the replacements
*/ */
std::string replaceStrings(const std::string& input, const std::map<std::string, std::string>& replacements) const; std::string replaceStrings(const std::string& input, const std::map<std::string, std::string>& replacements) const;
/** /**
* Create an OpenCL Program from source code. * Create an OpenCL Program from source code.
* *
* @param source the source code of the program * @param source the source code of the program
* @param optimizationFlags the optimization flags to pass to the OpenCL compiler. If this is * @param optimizationFlags the optimization flags to pass to the OpenCL compiler. If this is
* omitted, a default set of options will be used * omitted, a default set of options will be used
*/ */
cl::Program createProgram(const std::string source, const char* optimizationFlags = NULL); cl::Program createProgram(const std::string source, const char* optimizationFlags = NULL);
/** /**
* Create an OpenCL Program from source code. * Create an OpenCL Program from source code.
* *
* @param source the source code of the program * @param source the source code of the program
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program * @param defines a set of preprocessor definitions (name, value) to define when compiling the program
* @param optimizationFlags the optimization flags to pass to the OpenCL compiler. If this is * @param optimizationFlags the optimization flags to pass to the OpenCL compiler. If this is
* omitted, a default set of options will be used * omitted, a default set of options will be used
*/ */
cl::Program createProgram(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL); cl::Program createProgram(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
/** /**
* Execute a kernel. * Execute a kernel.
* *
* @param kernel the kernel to execute * @param kernel the kernel to execute
* @param workUnits the maximum number of work units that should be used * @param workUnits the maximum number of work units that should be used
* @param blockSize the size of each thread block to use * @param blockSize the size of each thread block to use
*/ */
void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1); void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
/** /**
* Set all elements of an array to 0. * Set all elements of an array to 0.
*/ */
void clearBuffer(OpenCLArray<float>& array); void clearBuffer(OpenCLArray<float>& array);
/** /**
* Set all elements of an array to 0. * Set all elements of an array to 0.
*/ */
void clearBuffer(OpenCLArray<mm_float4>& array); void clearBuffer(OpenCLArray<mm_float4>& array);
/** /**
* Set all elements of an array to 0. * Set all elements of an array to 0.
* *
* @param memory the Memory to clear * @param memory the Memory to clear
* @param size the number of float elements in the buffer * @param size the number of float elements in the buffer
*/ */
void clearBuffer(cl::Memory& memory, int size); void clearBuffer(cl::Memory& memory, int size);
/** /**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation. * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
* *
* @param memory the Memory to clear * @param memory the Memory to clear
* @param size the number of float elements in the buffer * @param size the number of float elements in the buffer
*/ */
void addAutoclearBuffer(cl::Memory& memory, int size); void addAutoclearBuffer(cl::Memory& memory, int size);
/** /**
* Clear all buffers that have been registered with addAutoclearBuffer(). * Clear all buffers that have been registered with addAutoclearBuffer().
*/ */
void clearAutoclearBuffers(); void clearAutoclearBuffers();
/** /**
* Given a collection of buffers packed into an array, sum them and store * Given a collection of buffers packed into an array, sum them and store
* the sum in the first buffer. * the sum in the first buffer.
* *
* @param array the array containing the buffers to reduce * @param array the array containing the buffers to reduce
* @param numBuffers the number of buffers packed into the array * @param numBuffers the number of buffers packed into the array
*/ */
void reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers); void reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers);
/** /**
* Get the current simulation time. * Get the current simulation time.
*/ */
double getTime() { double getTime() {
return time; return time;
} }
/** /**
* Set the current simulation time. * Set the current simulation time.
*/ */
void setTime(double t) { void setTime(double t) {
time = t; time = t;
} }
/** /**
* Get the number of integration steps that have been taken. * Get the number of integration steps that have been taken.
*/ */
int getStepCount() { int getStepCount() {
return stepCount; return stepCount;
} }
/** /**
* Set the number of integration steps that have been taken. * Set the number of integration steps that have been taken.
*/ */
void setStepCount(int steps) { void setStepCount(int steps) {
stepCount = steps; stepCount = steps;
} }
/** /**
* Get the number of times forces or energy has been computed. * Get the number of times forces or energy has been computed.
*/ */
int getComputeForceCount() { int getComputeForceCount() {
return computeForceCount; return computeForceCount;
} }
/** /**
* Set the number of times forces or energy has been computed. * Set the number of times forces or energy has been computed.
*/ */
void setComputeForceCount(int count) { void setComputeForceCount(int count) {
computeForceCount = count; computeForceCount = count;
} }
/** /**
* Get the number of atoms. * Get the number of atoms.
*/ */
int getNumAtoms() const { int getNumAtoms() const {
return numAtoms; return numAtoms;
} }
/** /**
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* most arrays with one element per atom. * most arrays with one element per atom.
*/ */
int getPaddedNumAtoms() const { int getPaddedNumAtoms() const {
return paddedNumAtoms; return paddedNumAtoms;
} }
/** /**
* Get the number of blocks of TileSize atoms. * Get the number of blocks of TileSize atoms.
*/ */
int getNumAtomBlocks() const { int getNumAtomBlocks() const {
return numAtomBlocks; return numAtomBlocks;
} }
/** /**
* Get the standard number of thread blocks to use when executing kernels. * Get the standard number of thread blocks to use when executing kernels.
*/ */
int getNumThreadBlocks() const { int getNumThreadBlocks() const {
return numThreadBlocks; return numThreadBlocks;
} }
/** /**
* Get the number of force buffers. * Get the number of force buffers.
*/ */
int getNumForceBuffers() const { int getNumForceBuffers() const {
return numForceBuffers; return numForceBuffers;
} }
/** /**
* Get the SIMD width of the device being used. * Get the SIMD width of the device being used.
*/ */
int getSIMDWidth() const { int getSIMDWidth() const {
return simdWidth; return simdWidth;
} }
/** /**
* Get whether the device being used supports 64 bit atomic operations on global memory. * Get whether the device being used supports 64 bit atomic operations on global memory.
*/ */
bool getSupports64BitGlobalAtomics() { bool getSupports64BitGlobalAtomics() {
return supports64BitGlobalAtomics; return supports64BitGlobalAtomics;
} }
/** /**
* Get the size of the periodic box. * Get the size of the periodic box.
*/ */
mm_float4 getPeriodicBoxSize() const { mm_float4 getPeriodicBoxSize() const {
return periodicBoxSize; return periodicBoxSize;
} }
/** /**
* Set the size of the periodic box. * Set the size of the periodic box.
*/ */
void setPeriodicBoxSize(double xsize, double ysize, double zsize) { void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0); periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0); invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
} }
/** /**
* Get the inverse of the size of the periodic box. * Get the inverse of the size of the periodic box.
*/ */
mm_float4 getInvPeriodicBoxSize() const { mm_float4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize; return invPeriodicBoxSize;
} }
/** /**
* Get the OpenCLIntegrationUtilities for this context. * Get the OpenCLIntegrationUtilities for this context.
*/ */
OpenCLIntegrationUtilities& getIntegrationUtilities() { OpenCLIntegrationUtilities& getIntegrationUtilities() {
return *integration; return *integration;
} }
/** /**
* Get the OpenCLNonbondedUtilities for this context. * Get the OpenCLNonbondedUtilities for this context.
*/ */
OpenCLNonbondedUtilities& getNonbondedUtilities() { OpenCLNonbondedUtilities& getNonbondedUtilities() {
return *nonbonded; return *nonbonded;
} }
/** /**
* Get the thread used by this context for executing parallel computations. * Get the thread used by this context for executing parallel computations.
*/ */
WorkThread& getWorkThread() { WorkThread& getWorkThread() {
return *thread; return *thread;
} }
/**
 * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
 * together in the arrays.  (Declaration only; the definition is not in this header.)
 */
void reorderAtoms();
private: private:
struct Molecule; struct Molecule;
struct MoleculeGroup; struct MoleculeGroup;
void findMoleculeGroups(const System& system); void findMoleculeGroups(const System& system);
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds); static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
double time; double time;
OpenCLPlatform::PlatformData& platformData; OpenCLPlatform::PlatformData& platformData;
int deviceIndex; int deviceIndex;
int contextIndex; int contextIndex;
int stepCount; int stepCount;
int computeForceCount; int computeForceCount;
int numAtoms; int numAtoms;
int paddedNumAtoms; int paddedNumAtoms;
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
int numForceBuffers; int numForceBuffers;
int simdWidth; int simdWidth;
bool supports64BitGlobalAtomics; bool supports64BitGlobalAtomics;
mm_float4 periodicBoxSize; mm_float4 periodicBoxSize;
mm_float4 invPeriodicBoxSize; mm_float4 invPeriodicBoxSize;
std::string compilationOptions, defaultOptimizationOptions; std::string defaultOptimizationOptions;
cl::Context context; std::map<std::string, std::string> compilationDefines;
cl::Device device; cl::Context context;
cl::CommandQueue queue; cl::Device device;
cl::Program utilities; cl::CommandQueue queue;
cl::Kernel clearBufferKernel; cl::Program utilities;
cl::Kernel clearTwoBuffersKernel; cl::Kernel clearBufferKernel;
cl::Kernel clearThreeBuffersKernel; cl::Kernel clearTwoBuffersKernel;
cl::Kernel clearFourBuffersKernel; cl::Kernel clearThreeBuffersKernel;
cl::Kernel reduceFloat4Kernel; cl::Kernel clearFourBuffersKernel;
std::vector<OpenCLForceInfo*> forces; cl::Kernel reduceFloat4Kernel;
std::vector<MoleculeGroup> moleculeGroups; std::vector<OpenCLForceInfo*> forces;
std::vector<mm_int4> posCellOffsets; std::vector<MoleculeGroup> moleculeGroups;
OpenCLArray<mm_float4>* posq; std::vector<mm_int4> posCellOffsets;
OpenCLArray<mm_float4>* velm; OpenCLArray<mm_float4>* posq;
OpenCLArray<mm_float4>* force; OpenCLArray<mm_float4>* velm;
OpenCLArray<mm_float4>* forceBuffers; OpenCLArray<mm_float4>* force;
OpenCLArray<cl_float>* energyBuffer; OpenCLArray<mm_float4>* forceBuffers;
OpenCLArray<cl_int>* atomIndex; OpenCLArray<cl_float>* energyBuffer;
std::vector<cl::Memory*> autoclearBuffers; OpenCLArray<cl_int>* atomIndex;
std::vector<int> autoclearBufferSizes; std::vector<cl::Memory*> autoclearBuffers;
OpenCLIntegrationUtilities* integration; std::vector<int> autoclearBufferSizes;
OpenCLNonbondedUtilities* nonbonded; OpenCLIntegrationUtilities* integration;
WorkThread* thread; OpenCLNonbondedUtilities* nonbonded;
}; WorkThread* thread;
};
/**
 * Plain aggregate of two index lists describing a group of molecules.
 * NOTE(review): presumably "atoms" lists the atom indices of one representative
 * molecule and "instances" identifies each molecule sharing that layout --
 * confirm against findMoleculeGroups().
 */
struct OpenCLContext::MoleculeGroup {
    std::vector<int> atoms;
    std::vector<int> instances;
};
/**
* This abstract class defines a task to be executed on the worker thread. /**
*/ * This abstract class defines a task to be executed on the worker thread.
class OpenCLContext::WorkTask { */
public: class OpenCLContext::WorkTask {
virtual void execute() = 0; public:
}; virtual void execute() = 0;
};
/**
 * A worker thread that executes WorkTask objects submitted through addTask().
 * The state below (task queue, status flags, pthread mutex/condition variables
 * and thread handle) is managed by member functions defined outside this header.
 */
class OpenCLContext::WorkThread {
public:
    struct ThreadData;
    WorkThread();
    ~WorkThread();
    /**
     * Request that a task be executed on the worker thread.  The argument should have been allocated on the
     * heap with the "new" operator.  After its execute() method finishes, the object will be deleted automatically.
     *
     * @param task  the task to run; ownership passes to the worker thread
     */
    void addTask(OpenCLContext::WorkTask* task);
    /**
     * Get whether the worker thread is idle, waiting for a task to be added.
     */
    bool isWaiting();
    /**
     * Get whether the worker thread has exited.
     */
    bool isFinished();
    /**
     * Block until all tasks have finished executing and the worker thread is idle.
     */
    void flush();
private:
    // Tasks submitted via addTask().
    std::queue<OpenCLContext::WorkTask*> tasks;
    // Status flags reported by isWaiting() and isFinished().
    bool waiting, finished;
    // NOTE(review): presumably queueLock guards the queue and flags, and the two
    // conditions signal "task available" / "queue drained" -- confirm in the .cpp.
    pthread_mutex_t queueLock;
    pthread_cond_t waitForTaskCondition, queueEmptyCondition;
    pthread_t thread;
};
} // namespace OpenMM

#endif /*OPENMM_OPENCLCONTEXT_H_*/
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment