/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include
#include "OpenCLContext.h"
#include "OpenCLArray.h"
#include "OpenCLBondedUtilities.h"
#include "OpenCLEvent.h"
#include "OpenCLForceInfo.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLKernelSources.h"
#include "OpenCLNonbondedUtilities.h"
#include "OpenCLProgram.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "openmm/internal/ContextImpl.h"
#include
#include
#include
#include
#include
#include
using namespace OpenMM;
using namespace std;
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
const int OpenCLContext::ThreadBlockSize = 64;
const int OpenCLContext::TileSize = 32;
static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
string skip = "OpenCL Build Warning : Compiler build log:";
if (strncmp(errinfo, skip.c_str(), skip.length()) == 0)
return; // OS X Lion insists on calling this for every build warning, even though they aren't errors.
std::cerr << "OpenCL internal error: " << errinfo << std::endl;
}
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData, OpenCLContext* originalContext) :
ComputeContext(system), platformData(platformData), numForceBuffers(0), hasAssignedPosqCharges(false),
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for Precision: "+precision);
try {
contextIndex = platformData.contexts.size();
std::vector platforms;
cl::Platform::get(&platforms);
if (platformIndex < -1 || platformIndex >= (int) platforms.size())
throw OpenMMException("Illegal value for OpenCLPlatformIndex: "+intToString(platformIndex));
const int minThreadBlockSize = 32;
int bestSpeed = -1;
int bestDevice = -1;
int bestPlatform = -1;
for (int j = 0; j < platforms.size(); j++) {
// If they supplied a valid platformIndex, we only look through that platform
if (j != platformIndex && platformIndex != -1)
continue;
string platformVendor = platforms[j].getInfo();
vector devices;
platforms[j].getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (deviceIndex < -1 || deviceIndex >= (int) devices.size())
throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
for (int i = 0; i < (int) devices.size(); i++) {
// If they supplied a valid deviceIndex, we only look through that one
if (i != deviceIndex && deviceIndex != -1)
continue;
if (platformVendor == "Apple" && (devices[i].getInfo() == CL_DEVICE_TYPE_CPU))
continue; // The CPU device on OS X won't work correctly.
if (useMixedPrecision || useDoublePrecision) {
bool supportsDouble = (devices[i].getInfo().find("cl_khr_fp64") != string::npos);
if (!supportsDouble)
continue; // This device does not support double precision.
}
int maxSize = devices[i].getInfo()[0];
int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo() != CL_DEVICE_TYPE_GPU) {
processingElementsPerComputeUnit = 1;
}
else if (devices[i].getInfo().find("cl_nv_device_attribute_query") != string::npos) {
cl_uint computeCapabilityMajor;
clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32);
}
else if (devices[i].getInfo().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
// or the CPU device) so still have to check for errors.
try {
processingElementsPerComputeUnit =
// AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
// The SIMD width is the number of threads the SIMD executes per cycle.
// This will be less than the wavefront width since it takes several
// cycles to execute the full wavefront.
// The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
// this is the number of ALUs that can be executing per instruction per thread.
devices[i].getInfo() *
devices[i].getInfo() *
devices[i].getInfo();
// Just in case any of the queries return 0.
if (processingElementsPerComputeUnit <= 0)
processingElementsPerComputeUnit = 1;
}
catch (cl::Error err) {
// Runtime does not support the queries so use default.
}
}
int speed = devices[i].getInfo()*processingElementsPerComputeUnit*devices[i].getInfo();
if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
bestDevice = i;
bestSpeed = speed;
bestPlatform = j;
}
}
}
if (bestPlatform == -1)
throw OpenMMException("No compatible OpenCL platform is available");
if (bestDevice == -1)
throw OpenMMException("No compatible OpenCL device is available");
vector devices;
platforms[bestPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
string platformVendor = platforms[bestPlatform].getInfo();
device = devices[bestDevice];
this->deviceIndex = bestDevice;
this->platformIndex = bestPlatform;
if (device.getInfo() < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = "";
else
defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
supports64BitGlobalAtomics = (device.getInfo().find("cl_khr_int64_base_atomics") != string::npos);
supportsDoublePrecision = (device.getInfo().find("cl_khr_fp64") != string::npos);
if ((useDoublePrecision || useMixedPrecision) && !supportsDoublePrecision)
throw OpenMMException("This device does not support double precision");
string vendor = device.getInfo();
int numThreadBlocksPerComputeUnit = 6;
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationDefines["WARPS_ARE_ATOMIC"] = "";
simdWidth = 32;
if (device.getInfo().find("cl_nv_device_attribute_query") != string::npos) {
// Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
// proper extension as supported. We only use them on compute level 2.0 or later, since they're very
// slow on earlier GPUs.
cl_uint computeCapabilityMajor;
clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
if (computeCapabilityMajor > 1)
supports64BitGlobalAtomics = true;
if (computeCapabilityMajor == 5) {
// Workaround for a bug in Maxwell on CUDA 6.x.
string platformVersion = platforms[bestPlatform].getInfo();
if (platformVersion.find("CUDA 6") != string::npos)
supports64BitGlobalAtomics = false;
}
}
}
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
if (device.getInfo() != CL_DEVICE_TYPE_GPU) {
/// \todo Is 6 a good value for the OpenCL CPU device?
// numThreadBlocksPerComputeUnit = ?;
simdWidth = 1;
}
else {
bool amdPostSdk2_4 = false;
// Default to 1 which will use the default kernels.
simdWidth = 1;
if (device.getInfo().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime so still have to
// check for errors.
try {
// Must catch cl:Error as will fail if runtime does not support queries.
cl_uint simdPerComputeUnit = device.getInfo();
simdWidth = device.getInfo();
// If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
if (simdPerComputeUnit > 1) {
if (simdWidth == 32)
numThreadBlocksPerComputeUnit = 6*simdPerComputeUnit; // Navi seems to like more thread blocks than older GPUs
else
numThreadBlocksPerComputeUnit = 4*simdPerComputeUnit;
}
// If the queries are supported then must be newer than SDK 2.4.
amdPostSdk2_4 = true;
}
catch (cl::Error err) {
// Runtime does not support the query so is unlikely to be the newer scalar GPU.
// Stay with the default simdWidth and numThreadBlocksPerComputeUnit.
}
}
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. This is fixed after SDK 2.4.
if (!amdPostSdk2_4)
compilationDefines["AMD_ATOMIC_WORK_AROUND"] = "";
}
}
else
simdWidth = 1;
if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision)
compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
if (simdWidth >= 32)
compilationDefines["SYNC_WARPS"] = "mem_fence(CLK_LOCAL_MEM_FENCE)";
else
compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
vector contextDevices;
contextDevices.push_back(device);
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
if (originalContext == NULL) {
context = cl::Context(contextDevices, cprops, errorCallback);
defaultQueue = cl::CommandQueue(context, device);
}
else {
context = originalContext->context;
defaultQueue = originalContext->defaultQueue;
}
currentQueue = defaultQueue;
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo();
if (useDoublePrecision) {
posq.initialize(*this, paddedNumAtoms, "posq");
velm.initialize(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_double4";
compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4";
compilationDefines["convert_mixed4"] = "convert_double4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else if (useMixedPrecision) {
posq.initialize(*this, paddedNumAtoms, "posq");
posqCorrection.initialize(*this, paddedNumAtoms, "posq");
velm.initialize(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["convert_mixed4"] = "convert_double4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else {
posq.initialize(*this, paddedNumAtoms, "posq");
velm.initialize(*this, paddedNumAtoms, "velm");
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["convert_mixed4"] = "convert_float4";
compilationDefines["make_mixed2"] = "make_float2";
compilationDefines["make_mixed3"] = "make_float3";
compilationDefines["make_mixed4"] = "make_float4";
}
longForceBuffer.initialize(*this, 3*paddedNumAtoms, "longForceBuffer");
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
atomIndexDevice.initialize(*this, paddedNumAtoms, "atomIndexDevice");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
atomIndex[i] = i;
atomIndexDevice.upload(atomIndex);
}
catch (cl::Error err) {
std::stringstream str;
str<<"Error initializing context: "< values(valuesArray.getSize());
float nextValue = 1e-4f;
for (auto& val : values) {
val.s0 = nextValue;
nextValue *= (float) M_PI;
}
valuesArray.upload(values);
accuracyKernel.setArg(0, valuesArray.getDeviceBuffer());
accuracyKernel.setArg(1, values.size());
executeKernel(accuracyKernel, values.size());
valuesArray.download(values);
double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
for (auto& val : values) {
double v = val.s0;
double correctSqrt = sqrt(v);
maxSqrtError = max(maxSqrtError, fabs(correctSqrt-val.s1)/correctSqrt);
maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-val.s2)*correctSqrt);
maxRecipError = max(maxRecipError, fabs(1.0/v-val.s3)/val.s3);
maxExpError = max(maxExpError, fabs(exp(v)-val.s4)/val.s4);
maxLogError = max(maxLogError, fabs(log(v)-val.s5)/val.s5);
}
compilationDefines["SQRT"] = (maxSqrtError < 1e-6) ? "native_sqrt" : "sqrt";
compilationDefines["RSQRT"] = (maxRsqrtError < 1e-6) ? "native_rsqrt" : "rsqrt";
compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
}
else {
compilationDefines["SQRT"] = "sqrt";
compilationDefines["RSQRT"] = "rsqrt";
compilationDefines["RECIP"] = "1.0/";
compilationDefines["EXP"] = "exp";
compilationDefines["LOG"] = "log";
}
compilationDefines["POW"] = "pow";
compilationDefines["COS"] = "cos";
compilationDefines["SIN"] = "sin";
compilationDefines["TAN"] = "tan";
compilationDefines["ACOS"] = "acos";
compilationDefines["ASIN"] = "asin";
compilationDefines["ATAN"] = "atan";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
if (boxIsTriclinic) {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
"delta.xyz -= scale3*periodicBoxVecZ.xyz; \\\n"
"real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
"delta.xy -= scale2*periodicBoxVecY.xy; \\\n"
"real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
"delta.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
"pos.xyz -= scale3*periodicBoxVecZ.xyz; \\\n"
"real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
"pos.xy -= scale2*periodicBoxVecY.xy; \\\n"
"real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
}
else {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"pos.xyz -= floor(pos.xyz*invPeriodicBoxSize.xyz)*periodicBoxSize.xyz;";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
}
// Create utilities objects.
bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this);
integration = new OpenCLIntegrationUtilities(*this, system);
expression = new OpenCLExpressionUtilities(*this);
}
OpenCLContext::~OpenCLContext() {
for (auto force : forces)
delete force;
for (auto listener : reorderListeners)
delete listener;
for (auto computation : preComputations)
delete computation;
for (auto computation : postComputations)
delete computation;
if (pinnedBuffer != NULL)
delete pinnedBuffer;
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL)
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
}
void OpenCLContext::initialize() {
bonded->initialize(system);
numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size());
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
numForceBuffers = std::max(numForceBuffers, nonbonded->getNumForceBuffers());
int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) {
forceBuffers.initialize(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize(*this, energyBufferSize, "energyBuffer");
energySum.initialize(*this, 1, "energySum");
}
else if (useMixedPrecision) {
forceBuffers.initialize(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize(*this, energyBufferSize, "energyBuffer");
energySum.initialize(*this, 1, "energySum");
}
else {
forceBuffers.initialize(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force.initialize(*this, &forceBuffers.getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer.initialize(*this, energyBufferSize, "energyBuffer");
energySum.initialize(*this, 1, "energySum");
}
reduceForcesKernel.setArg(0, longForceBuffer.getDeviceBuffer());
reduceForcesKernel.setArg(1, forceBuffers.getDeviceBuffer());
reduceForcesKernel.setArg(2, paddedNumAtoms);
reduceForcesKernel.setArg(3, numForceBuffers);
if (supports64BitGlobalAtomics)
addAutoclearBuffer(longForceBuffer);
addAutoclearBuffer(forceBuffers);
addAutoclearBuffer(energyBuffer);
int numEnergyParamDerivs = energyParamDerivNames.size();
if (numEnergyParamDerivs > 0) {
if (useDoublePrecision || useMixedPrecision)
energyParamDerivBuffer.initialize(*this, numEnergyParamDerivs*energyBufferSize, "energyParamDerivBuffer");
else
energyParamDerivBuffer.initialize(*this, numEnergyParamDerivs*energyBufferSize, "energyParamDerivBuffer");
addAutoclearBuffer(energyParamDerivBuffer);
}
int bufferBytes = max(velm.getSize()*velm.getElementSize(), energyBufferSize*energyBuffer.getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((mm_double4*) pinnedMemory)[i] = mm_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (cl_float) (1.0/mass));
}
velm.upload(pinnedMemory);
findMoleculeGroups();
nonbonded->initialize(system);
}
void OpenCLContext::initializeContexts() {
getPlatformData().initializeContexts(system);
}
void OpenCLContext::addForce(ComputeForceInfo* force) {
ComputeContext::addForce(force);
OpenCLForceInfo* clinfo = dynamic_cast(force);
if (clinfo != NULL)
requestForceBuffers(clinfo->getRequiredForceBuffers());
}
void OpenCLContext::requestForceBuffers(int minBuffers) {
numForceBuffers = std::max(numForceBuffers, minBuffers);
}
cl::Program OpenCLContext::createProgram(const string source, const char* optimizationFlags) {
return createProgram(source, map(), optimizationFlags);
}
cl::Program OpenCLContext::createProgram(const string source, const map& defines, const char* optimizationFlags) {
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
for (auto& pair : compilationDefines) {
src << "#define " << pair.first;
if (!pair.second.empty())
src << " " << pair.second;
src << endl;
}
if (!compilationDefines.empty())
src << endl;
if (supportsDoublePrecision)
src << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double3 real3;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double3 mixed3;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
src << OpenCLKernelSources::common << endl;
for (auto& pair : defines) {
src << "#define " << pair.first;
if (!pair.second.empty())
src << " " << pair.second;
src << endl;
}
if (!defines.empty())
src << endl;
src << source << endl;
// Get length before using c_str() to avoid length() call invalidating the c_str() value.
string src_string = src.str();
::size_t src_length = src_string.length();
cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
cl::Program program(context, sources);
try {
program.build(vector(1, device), options.c_str());
} catch (cl::Error err) {
throw OpenMMException("Error compiling kernel: "+program.getBuildInfo(device));
}
return program;
}
cl::CommandQueue& OpenCLContext::getQueue() {
return currentQueue;
}
void OpenCLContext::setQueue(cl::CommandQueue& queue) {
currentQueue = queue;
}
void OpenCLContext::restoreDefaultQueue() {
currentQueue = defaultQueue;
}
OpenCLArray* OpenCLContext::createArray() {
return new OpenCLArray();
}
ComputeEvent OpenCLContext::createEvent() {
return shared_ptr(new OpenCLEvent(*this));
}
ComputeProgram OpenCLContext::compileProgram(const std::string source, const std::map& defines) {
cl::Program program = createProgram(source, defines);
return shared_ptr(new OpenCLProgram(*this, program));
}
OpenCLArray& OpenCLContext::unwrap(ArrayInterface& array) const {
OpenCLArray* clarray;
ComputeArray* wrapper = dynamic_cast(&array);
if (wrapper != NULL)
clarray = dynamic_cast(&wrapper->getArray());
else
clarray = dynamic_cast(&array);
if (clarray == NULL)
throw OpenMMException("Array argument is not an OpenCLArray");
return *clarray;
}
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
try {
currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
}
catch (cl::Error err) {
stringstream str;
str<<"Error invoking kernel "<()<<": "<(0, memory);
clearBufferKernel.setArg(1, words);
executeKernel(clearBufferKernel, words, 128);
}
void OpenCLContext::addAutoclearBuffer(ArrayInterface& array) {
addAutoclearBuffer(unwrap(array).getDeviceBuffer(), array.getSize()*array.getElementSize());
}
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
autoclearBuffers.push_back(&memory);
autoclearBufferSizes.push_back(size/4);
}
void OpenCLContext::clearAutoclearBuffers() {
int base = 0;
int total = autoclearBufferSizes.size();
while (total-base >= 6) {
clearSixBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearSixBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearSixBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearSixBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearSixBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearSixBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
clearSixBuffersKernel.setArg(6, *autoclearBuffers[base+3]);
clearSixBuffersKernel.setArg(7, autoclearBufferSizes[base+3]);
clearSixBuffersKernel.setArg(8, *autoclearBuffers[base+4]);
clearSixBuffersKernel.setArg(9, autoclearBufferSizes[base+4]);
clearSixBuffersKernel.setArg(10, *autoclearBuffers[base+5]);
clearSixBuffersKernel.setArg(11, autoclearBufferSizes[base+5]);
executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
base += 6;
}
if (total-base == 5) {
clearFiveBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearFiveBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearFiveBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearFiveBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearFiveBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearFiveBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
clearFiveBuffersKernel.setArg(6, *autoclearBuffers[base+3]);
clearFiveBuffersKernel.setArg(7, autoclearBufferSizes[base+3]);
clearFiveBuffersKernel.setArg(8, *autoclearBuffers[base+4]);
clearFiveBuffersKernel.setArg(9, autoclearBufferSizes[base+4]);
executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
}
else if (total-base == 4) {
clearFourBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearFourBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearFourBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearFourBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearFourBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearFourBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
clearFourBuffersKernel.setArg(6, *autoclearBuffers[base+3]);
clearFourBuffersKernel.setArg(7, autoclearBufferSizes[base+3]);
executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
}
else if (total-base == 3) {
clearThreeBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearThreeBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearThreeBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearThreeBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearThreeBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearThreeBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
}
else if (total-base == 2) {
clearTwoBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearTwoBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearTwoBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearTwoBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
}
else if (total-base == 1) {
clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]*4);
}
}
void OpenCLContext::reduceForces() {
executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
}
void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
int bufferSize = array.getSize()/numBuffers;
reduceReal4Kernel.setArg(0, array.getDeviceBuffer());
reduceReal4Kernel.setArg(1, bufferSize);
reduceReal4Kernel.setArg(2, numBuffers);
executeKernel(reduceReal4Kernel, bufferSize, 128);
}
double OpenCLContext::reduceEnergy() {
int workGroupSize = device.getInfo();
if (workGroupSize > 512)
workGroupSize = 512;
reduceEnergyKernel.setArg(0, energyBuffer.getDeviceBuffer());
reduceEnergyKernel.setArg(1, energySum.getDeviceBuffer());
reduceEnergyKernel.setArg(2, energyBuffer.getSize());
reduceEnergyKernel.setArg(3, workGroupSize);
reduceEnergyKernel.setArg(4, workGroupSize*energyBuffer.getElementSize(), NULL);
executeKernel(reduceEnergyKernel, workGroupSize, workGroupSize);
if (getUseDoublePrecision() || getUseMixedPrecision()) {
double energy;
energySum.download(&energy);
return energy;
}
else {
float energy;
energySum.download(&energy);
return energy;
}
}
void OpenCLContext::setCharges(const vector& charges) {
if (!chargeBuffer.isInitialized())
chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
vector c(numAtoms);
for (int i = 0; i < numAtoms; i++)
c[i] = charges[i];
chargeBuffer.upload(c, true);
setChargesKernel.setArg(0, chargeBuffer.getDeviceBuffer());
setChargesKernel.setArg(1, posq.getDeviceBuffer());
setChargesKernel.setArg(2, atomIndexDevice.getDeviceBuffer());
setChargesKernel.setArg(3, numAtoms);
executeKernel(setChargesKernel, numAtoms);
}
bool OpenCLContext::requestPosqCharges() {
bool allow = !hasAssignedPosqCharges;
hasAssignedPosqCharges = true;
return allow;
}
void OpenCLContext::addEnergyParameterDerivative(const string& param) {
// See if this parameter has already been registered.
for (int i = 0; i < energyParamDerivNames.size(); i++)
if (param == energyParamDerivNames[i])
return;
energyParamDerivNames.push_back(param);
}
void OpenCLContext::flushQueue() {
getQueue().flush();
}